From 6bd8cd95e908c3f9e3be2be4700fcdc3e9886d0e Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Thu, 9 May 2024 18:04:20 +0000 Subject: [PATCH 01/31] AIO API update --- csrc/aio/py_test/aio_bench_generate_param.py | 6 +- csrc/aio/py_test/aio_bench_perf_sweep.py | 133 ++++----------- csrc/aio/py_test/ds_aio_args.py | 168 +++++++++++++++++++ csrc/aio/py_test/ds_aio_basic.py | 27 +-- csrc/aio/py_test/ds_aio_handle.py | 108 ++++++++---- csrc/aio/py_test/ds_aio_job.py | 48 ++++++ csrc/aio/py_test/run_read_sweep.sh | 43 ++--- csrc/aio/py_test/test_ds_aio.py | 66 +------- csrc/aio/py_test/test_ds_aio_utils.py | 41 ++++- csrc/aio/py_test/validate_async_io.py | 1 + 10 files changed, 400 insertions(+), 241 deletions(-) create mode 100644 csrc/aio/py_test/ds_aio_args.py create mode 100644 csrc/aio/py_test/ds_aio_job.py diff --git a/csrc/aio/py_test/aio_bench_generate_param.py b/csrc/aio/py_test/aio_bench_generate_param.py index 09d0e03c7ef6..7a0ab59ed73d 100644 --- a/csrc/aio/py_test/aio_bench_generate_param.py +++ b/csrc/aio/py_test/aio_bench_generate_param.py @@ -41,9 +41,9 @@ def convert_to_param(key): return { "single_submit": "true" if key[0] == "single" else "false", "overlap_events": "true" if key[1] == "overlap" else "false", - "thread_count": int(key[3]), - "queue_depth": int(key[4]), - "block_size": int(key[5]) + "thread_count": int(key[5]), + "queue_depth": int(key[3]), + "block_size": int(key[4]) } diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index 7d55f7ded65c..5d4172066a6b 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -10,12 +10,11 @@ import argparse import json import itertools -import subprocess import shutil -from test_ds_aio_utils import refine_integer_value +from ds_aio_job import Job, run_job from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ - READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR + READ_LOG_DIR, WRITE_LOG_DIR from deepspeed.ops.op_builder import AsyncIOBuilder OTHER_OPTIONS = '--handle' @@ -23,62 +22,34 @@ DEFAULT_SWEEP_CONFIG = { "block_size": ["128K", "256K"], "queue_depth": [4, 16, 32], - "overlap_events": [True, False], - "io_parallel": [2, 8], - "single_submit": [False] + "sequential_requests": [True, False], + "single_submit": [False], + "io_parallel": [2, 8] } -class Job(object): - - def __init__(self, cmd_line, output_file=None, work_dir=None): - self.cmd_line = cmd_line - self.output_file = output_file - self.work_dir = work_dir - self.output_fd = None - - def cmd(self): - return self.cmd_line - - def get_stdout(self): - return self.output_fd - - def get_stderr(self): - return self.output_fd - - def get_cwd(self): - return self.work_dir - - def open_output_file(self): - if self.output_file is not None: - self.output_fd = open(self.output_file, 'w') - - def close_output_file(self): - if self.output_fd is not None: - self.output_fd.close() - self.output_fd = None - - class SweepConfig(object): def __init__(self, args): - self.nvme_dir = args.nvme_dir - self.io_size = args.io_size + self.folder_to_device_mapping = get_ftd_map(args.nvme_dir) self.search_space = get_sweep_config_dict(args.sweep_config) + self.search_space.update(self.folder_to_device_mapping) self.read = not args.no_read self.write = not args.no_write self.flush_cache = not args.no_sudo self.log_dir = args.log_dir - self.loops = args.loops - self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}' + if args.gpu: + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size} --gpu' + else: + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--nvme_dir', + nargs='+', required=True, - type=str, help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') @@ -92,6 +63,8 @@ def parse_arguments(): default="400M", help='Number of I/O bytes to read/write for performance measurements.') + parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.') + parser.add_argument( '--no_sudo', action='store_true', @@ -118,6 +91,11 @@ def dump_cmd_lines(cmd_lines): print(f'{i}: {cmd}') +def get_ftd_map(nvme_dir_list): + dir_list = [' '.join(nvme_dir_list[:(i + 1)]) for i in range(len(nvme_dir_list))] + return {'folder_to_device_mapping': dir_list} + + def get_sweep_config_dict(sweep_config_json): if sweep_config_json is None: return DEFAULT_SWEEP_CONFIG @@ -148,16 +126,6 @@ def flatten_options(key, value_list): return cmd_list -def run_job(job): - args = ' '.join(job.cmd()) - print(f'args = {args}') - job.open_output_file() - proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) - job.close_output_file() - assert proc.returncode == 0, \ - f"This command failed: {job.cmd()}" - - def launch_sweep(sweep_jobs, sync_job, flush_cache_job): for perf_job in sweep_jobs: if flush_cache_job is not None: @@ -176,7 +144,12 @@ def create_cmd_tags(cmd_line): if len(fields) == 1: tags[fields[0]] = None elif len(fields) == 2: - tags[fields[0]] = fields[1] + if fields[0] == '--folder_to_device_mapping': + tags[fields[0]] = len(fields[1:]) + else: + tags[fields[0]] = fields[1] + elif len(fields) > 2: + tags[fields[0]] = len(fields[1:]) return tags @@ -184,16 +157,16 @@ def get_log_file(io_op_desc, cmd_line): QUEUE_DEPTH = "--queue_depth" BLOCK_SIZE = "--block_size" SINGLE_SUBMIT = "--single_submit" - OVERLAP_EVENTS = "--overlap_events" - THREAD_COUNT = "--threads" + SEQUENTIAL_REQUESTS = "--sequential_requests" + FTD_MAP = "--folder_to_device_mapping" IO_PARALLEL = "--io_parallel" tag_map = { QUEUE_DEPTH: "d", BLOCK_SIZE: "bs", SINGLE_SUBMIT: "single", - OVERLAP_EVENTS: "overlap", - THREAD_COUNT: "t", + SEQUENTIAL_REQUESTS: "sequential", + FTD_MAP: "ftd", IO_PARALLEL: "p" } @@ -201,14 +174,14 @@ def get_log_file(io_op_desc, cmd_line): QUEUE_DEPTH: 1, BLOCK_SIZE: "1M", SINGLE_SUBMIT: "block", - OVERLAP_EVENTS: "sequential", - THREAD_COUNT: 1, + SEQUENTIAL_REQUESTS: "overlap", + FTD_MAP: 1, IO_PARALLEL: 1 } def get_default_value(tag): value = tag_default[tag] - if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]: + if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]: return value return f'{tag_map[tag]}{value}' @@ -218,7 +191,7 @@ def get_config_value(tag, value): return tag_key return f'{tag_key}{value}' - tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE] + tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL] log_tags = [io_op_desc] cmd_tags = create_cmd_tags(cmd_line) for tag in tag_list: @@ -252,40 +225,14 @@ def async_io_setup(): return AsyncIOBuilder().is_compatible() -def get_block_size_and_count(io_bytes): - block_size = 1 - block_count = io_bytes - bytes_in_KB = 1024 - - while block_count % bytes_in_KB == 0: - block_size *= bytes_in_KB - block_count /= bytes_in_KB - - return int(block_size), int(block_count) - - -def create_read_file(sweep_config): - read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}') - os.makedirs(read_folder, exist_ok=True) - read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt') - block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size)) - dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}']) - print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') - run_job(dd_job) - print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') - return read_folder, read_file_name - - def remove_folder(folder): assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found" shutil.rmtree(folder) def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): - read_folder, read_file_name = create_read_file(sweep_config) - read_option = f'--read_file {read_file_name}' - read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] - #dump_cmd_lines(read_cmd_lines) + read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines] + #dump_cmd_lines(cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') os.makedirs(log_folder, exist_ok=True) @@ -294,15 +241,9 @@ def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) - remove_folder(read_folder) - def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): - write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}') - os.makedirs(write_folder, exist_ok=True) - write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt') - write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}' - write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] + write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines] #dump_cmd_lines(write_cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') @@ -312,8 +253,6 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) - remove_folder(write_folder) - def main(): print("Running performance sweep of deepspeed nvme library") diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py new file mode 100644 index 000000000000..66d843d68ea2 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_args.py @@ -0,0 +1,168 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import argparse +import os +from test_ds_aio_utils import refine_integer_value +from deepspeed.accelerator import get_accelerator + +MAPPING_DELIMITER = ':' + + +def refine_args(args): + if args.io_size and type(args.io_size) == str: + args.io_size = refine_integer_value(args.io_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + return args + + +def _get_mapping_dict(args): + if args.folder is not None: + d = {i: args.folder for i in range(args.multi_process)} + else: + d = {} + for m in args.folder_to_device_mapping: + fields = m.split(MAPPING_DELIMITER) + d[fields[1]] = fields[0] + + return d + + +def _validate_folder_mapping(args): + no_error = True + error_messages = [] + invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m] + if len(invalid_mappings) > 0: + error_messages.append( + f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}') + no_error = False + + folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping] + invalid_folders = [d for d in folder_list if not os.path.exists(d)] + if len(invalid_folders) > 0: + error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}') + no_error = False + + if args.gpu: + device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping] + invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()] + if len(invalid_device_list) > 0: + error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}') + no_error = False + + return no_error, error_messages + + +def validate_args(args): + no_error = True + error_messages = [] + + if args.folder is not None and len(args.folder_to_device_mapping) > 0: + error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + no_error = False + elif args.folder is None and len(args.folder_to_device_mapping) == 0: + error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + no_error = False + + # Validate --folder + if args.folder is not None and not os.path.exists(args.folder): + no_error = False + error_messages.append(f'Invalid folder in --folder: {args.folder} ') + + # Validate --folder_mapping_to_device + if len(args.folder_to_device_mapping) > 0: + no_mapping_error, mapping_error_messages = _validate_folder_mapping(args) + no_error = no_error and no_mapping_error + error_messages += mapping_error_messages + + if not no_error: + print(f'Found {len(error_messages)} validation errors') + for i, msg in enumerate(error_messages): + print(f'{i+1}: {msg}') + + return no_error + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.') + + parser.add_argument('--folder_to_device_mapping', + default=[], + nargs='+', + help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).' + 'Can be specified multiple times for multi-process runs,' + 'e.g. --path_map /mnt/nvme0:0 --path_map /mnt/nvme1:15 --gpu' + 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15') + + parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.') + + parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)') + + parser.add_argument('--multi_process', + type=int, + default=1, + help='Number of parallel processes doing I/O (default 1).') + + parser.add_argument('--block_size', + type=str, + default='1M', + help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabytes).') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).') + + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') + + parser.add_argument( + '--sequential_requests', + action='store_true', + help= + 'Delay I/O request submission until completion of prior requests (default is overlap I/O submission and completion requests.).' + ) + + parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') + + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + parser.add_argument('--slow_bounce_buffer', + action='store_true', + help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def get_validated_args(): + args = parse_arguments() + args = refine_args(args) + if not validate_args(args): + quit() + print(f'Successful validation of command line arguments') + + peer_tag = 'gpu' if args.gpu else 'process' + args.mapping_dict = _get_mapping_dict(args) + args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()] + assert len(args.mapping_dict) == len(args.mapping_list) + print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping') + for i, (device_id, folder) in enumerate(args.mapping_list): + print(f'[{i}]: {peer_tag} {device_id} <----> {folder}') + + return args diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index ad2a4349cd0c..e90886e17871 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -9,10 +9,9 @@ import torch import os import time +from deepspeed.ops.aio import AsyncIOBuilder from multiprocessing import Pool, Barrier from test_ds_aio_utils import report_results, task_log, task_barrier -from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import AsyncIOBuilder def pre_basic(args, tid, read_op): @@ -21,7 +20,7 @@ def pre_basic(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') ctxt = {} @@ -56,7 +55,7 @@ def main_basic_read(pool_params): args, tid, ctxt = pool_params start_time = time.time() AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, - args.single_submit, args.overlap_events, args.validate) + args.single_submit, not args.sequential_requests, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -67,7 +66,7 @@ def main_basic_write(pool_params): args, tid, ctxt = pool_params start_time = time.time() AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, - args.single_submit, args.overlap_events, args.validate) + args.single_submit, not args.sequential_requests, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -90,16 +89,17 @@ def get_schedule(args, read_op): def _aio_handle_tasklet(pool_params): args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) # Create schedule schedule = get_schedule(args, read_op) task_log(tid, f'schedule = {schedule}') - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run pre task task_log(tid, f'running pre-task') ctxt = schedule["pre"]((args, tid)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run main tasks in a loop ctxt["main_task_sec"] = 0 @@ -107,27 +107,28 @@ def _aio_handle_tasklet(pool_params): task_log(tid, f'running main task {i}') start_time = time.time() ctxt = schedule["main"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) stop_time = time.time() ctxt["main_task_sec"] += stop_time - start_time # Run post task task_log(tid, f'running post-task') ctxt = schedule["post"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops -def _init_tasklet(b): +def _init_takslet(b): global aio_barrier aio_barrier = b def aio_basic_multiprocessing(args, read_op): - b = Barrier(args.threads) - pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index d35b2713edae..369cb9d4030f 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -10,40 +10,48 @@ import os import time from multiprocessing import Pool, Barrier -from test_ds_aio_utils import report_results, task_log, task_barrier +from deepspeed.ops.aio import AsyncIOBuilder +from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import AsyncIOBuilder + +BUFFER = 'buffer' +BOUNCE_BUFFER = 'bounce_buffer' def pre_handle(args, tid, read_op): io_string = "Read" if read_op else "Write" - num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size - file = args.read_file if read_op else f'{args.write_file}.{tid}' - - io_parallel = args.io_parallel if args.io_parallel else 1 - handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, - args.overlap_events, io_parallel) - task_log(tid, f'Created deepspeed aio handle') + device_id, folder = args.mapping_list[tid] + filename = create_filename(folder, args.read, args.io_size, tid) + if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size): + create_file(filename, args.io_size) + task_log(tid, f'Allocate tensor of size {args.io_size} bytes') + bounce_buffer = None if args.gpu: - buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name()) + device_name = get_accelerator().device_name(device_id) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name) + if not args.slow_bounce_buffer: + bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, + device='cpu').pin_memory() else: - if args.use_accelerator_pin_memory: - buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) - else: - buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8)) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, + f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', + force=True) - task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + io_parallel = args.io_parallel if args.io_parallel else 1 + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + task_log(tid, f'created deepspeed aio handle') ctxt = {} - ctxt['file'] = file - ctxt['num_bytes'] = num_bytes + ctxt['file'] = filename + ctxt['num_bytes'] = args.io_size ctxt['handle'] = handle - ctxt['buffer'] = buffer + ctxt[BUFFER] = buffer + ctxt[BOUNCE_BUFFER] = bounce_buffer ctxt['elapsed_sec'] = 0 - task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') - return ctxt @@ -61,8 +69,10 @@ def pre_handle_write(pool_params): def post_handle(pool_params): _, _, ctxt = pool_params - ctxt["buffer"].detach() - ctxt["buffer"] = None + for buf in [BUFFER, BOUNCE_BUFFER]: + if ctxt[buf] is not None: + ctxt[buf].detach() + ctxt[buf] = None return ctxt @@ -71,9 +81,12 @@ def main_parallel_read(pool_params): handle = ctxt['handle'] start_time = time.time() - ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True) assert ret != -1 handle.wait() + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -82,9 +95,18 @@ def main_parallel_read(pool_params): def main_parallel_write(pool_params): args, tid, ctxt = pool_params + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + handle = ctxt['handle'] start_time = time.time() - ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True) assert ret != -1 handle.wait() end_time = time.time() @@ -98,8 +120,11 @@ def main_handle_read(pool_parms): handle = ctxt['handle'] start_time = time.time() - ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate) + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate) assert ret != -1 + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -108,9 +133,18 @@ def main_handle_read(pool_parms): def main_handle_write(pool_parms): args, tid, ctxt = pool_parms + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + handle = ctxt['handle'] start_time = time.time() - ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate) + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate) assert ret != -1 end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -123,27 +157,28 @@ def get_schedule(args, read_op): if read_op: schedule['pre'] = pre_handle_read schedule['post'] = post_handle - schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read + schedule['main'] = main_parallel_read else: schedule['pre'] = pre_handle_write schedule['post'] = post_handle - schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write + schedule['main'] = main_parallel_write return schedule def _aio_handle_tasklet(pool_params): args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) # Create schedule schedule = get_schedule(args, read_op) task_log(tid, f'schedule = {schedule}') - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run pre task task_log(tid, f'running pre-task') ctxt = schedule["pre"]((args, tid)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run main tasks in a loop ctxt["main_task_sec"] = 0 @@ -151,27 +186,28 @@ def _aio_handle_tasklet(pool_params): task_log(tid, f'running main task {i}') start_time = time.time() ctxt = schedule["main"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) stop_time = time.time() ctxt["main_task_sec"] += stop_time - start_time # Run post task task_log(tid, f'running post-task') ctxt = schedule["post"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops -def _init_tasklet(b): +def _init_takslet(b): global aio_barrier aio_barrier = b def aio_handle_multiprocessing(args, read_op): - b = Barrier(args.threads) - pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_job.py b/csrc/aio/py_test/ds_aio_job.py new file mode 100644 index 000000000000..bbddee1bf26d --- /dev/null +++ b/csrc/aio/py_test/ds_aio_job.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping tensors to/from (NVMe) storage devices. +""" +import subprocess + + +class Job(object): + + def __init__(self, cmd_line, output_file=None, work_dir=None): + self.cmd_line = cmd_line + self.output_file = output_file + self.work_dir = work_dir + self.output_fd = None + + def cmd(self): + return self.cmd_line + + def get_stdout(self): + return self.output_fd + + def get_stderr(self): + return self.output_fd + + def get_cwd(self): + return self.work_dir + + def open_output_file(self): + if self.output_file is not None: + self.output_fd = open(self.output_file, 'w') + + def close_output_file(self): + if self.output_fd is not None: + self.output_fd.close() + self.output_fd = None + + +def run_job(job): + args = ' '.join(job.cmd()) + print(f'args = {args}') + job.open_output_file() + proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) + job.close_output_file() + assert proc.returncode == 0, \ + f"This command failed: {job.cmd()}" diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index b9d7e050454a..d69aa13e49da 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -1,13 +1,13 @@ #!/bin/bash -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " exit 1 fi function validate_environment() { - validate_cmd="python ./validate_async_io.py" + validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python ./validate_async_io.py" eval ${validate_cmd} res=$? if [[ $res != 0 ]]; then @@ -20,15 +20,11 @@ function validate_environment() validate_environment -INPUT_FILE=$1 -if [[ ! -f ${INPUT_FILE} ]]; then - echo "Input file not found: ${INPUT_FILE}" - exit 1 -fi - +IO_SIZE=$1 LOG_DIR=$2/aio_perf_sweep +GPU_MEM=$3 RUN_SCRIPT=./test_ds_aio.py -READ_OPT="--read_file ${INPUT_FILE}" +READ_OPT="--read" if [[ -d ${LOG_DIR} ]]; then rm -f ${LOG_DIR}/* @@ -36,34 +32,41 @@ else mkdir -p ${LOG_DIR} fi -DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " +if [[ ${GPU_MEM} == "gpu" ]]; then + gpu_opt="--gpu" +else + gpu_opt="" +fi + +DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" for sub in single block; do + ftd_map="--folder_to_device_mapping \ + /workspace/nvme01/aio:0 " if [[ $sub == "single" ]]; then sub_opt="--single_submit" else sub_opt="" fi for ov in overlap sequential; do - if [[ $ov == "overlap" ]]; then - ov_opt="--overlap_events" + if [[ $ov == "sequential" ]]; then + ov_opt="--sequential_requests" else ov_opt="" fi - for t in 1 2 4 8; do - for p in 1 ; do - for d in 1 2 4 8 16 32; do - for bs in 128K 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" - OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" + for p in 1 ; do + for t in 1 2 4 8; do + for d in 8 16 32; do + for bs in 256K 512K 1M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} ${ftd_map}" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE}" LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" echo ${DISABLE_CACHE} echo ${cmd} echo ${SYNC} - eval ${DISABLE_CACHE} eval ${cmd} eval ${SYNC} sleep 2 diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py index e6242cb35789..6de72755e9e5 100755 --- a/csrc/aio/py_test/test_ds_aio.py +++ b/csrc/aio/py_test/test_ds_aio.py @@ -6,79 +6,19 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ -import os -import argparse import multiprocessing as mp from ds_aio_basic import aio_basic_multiprocessing from ds_aio_handle import aio_handle_multiprocessing -from test_ds_aio_utils import refine_args - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument('--read_file', type=str, default=None, help='Read file.') - - parser.add_argument('--write_file', type=str, default=None, help='Write file.') - - parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.') - - parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') - - parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') - - parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.') - - parser.add_argument('--single_submit', - action='store_true', - help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') - - parser.add_argument('--overlap_events', - action='store_true', - help='Overlap I/O submission and completion requests.') - - parser.add_argument('--validate', action='store_true', help='Perform validation in library.') - - parser.add_argument('--handle', action='store_true', help='Use AIO handle.') - - parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') - - parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') - - parser.add_argument('--gpu', action='store_true', help='Use GPU memory') - - parser.add_argument('--use_accelerator_pin_memory', - action='store_true', - help='Obtain pinned (CPU page-locked) tensors from accelerator') - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def validate_args(args): - if args.read_file and not os.path.isfile(args.read_file): - print(f'args validation error: {args.read_file} not found') - return False - - return True +from ds_aio_args import get_validated_args def main(): print(f'Testing deepspeed_aio python frontend') - args = parse_arguments() - refine_args(args) - if not validate_args(args): - quit() - + args = get_validated_args() mp.set_start_method('spawn') multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing - if args.read_file: - multiprocess_function(args, True) - - if args.write_file: - multiprocess_function(args, False) + multiprocess_function(args, args.read) if __name__ == "__main__": diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py index 6aad114c0bdc..968ff4a60ef9 100755 --- a/csrc/aio/py_test/test_ds_aio_utils.py +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -6,12 +6,17 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ +import os +from ds_aio_job import Job, run_job + BYTES_PER_GB = 1024**3 +BYTES_PER_MB = 1024**2 +BYTES_PER_KB = 1024 LOG_TIDS = [0] -def task_log(tid, msg): - if tid in LOG_TIDS: +def task_log(tid, msg, force=False): + if force or tid in LOG_TIDS: print(f'tid {tid}: {msg}') @@ -31,16 +36,29 @@ def report_results(args, read_op, pool_results): total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) task_latency_sec = max([sec for _, sec, _ in pool_results]) - task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB + task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB print(f'Task {io_string} Latency = {task_latency_sec} sec') print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') e2e_latency_sec = max([sec for sec, _, _ in pool_results]) - e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB + e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') +def get_block_size_and_count(io_bytes): + if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0: + block_size = BYTES_PER_MB + block_size_string = '1M' + else: + assert io_bytes % BYTES_PER_KB == 0 + block_size = BYTES_PER_KB + block_size_string = '1K' + block_count = io_bytes / block_size + + return block_size_string, int(block_count) + + def refine_integer_value(value): unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} @@ -50,9 +68,14 @@ def refine_integer_value(value): return int(value) -def refine_args(args): - if args.write_size and type(args.write_size) == str: - args.write_size = refine_integer_value(args.write_size) +def create_filename(folder, read_op, size, tid): + io_string = "read" if read_op else "write" + return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}') + - if args.block_size and type(args.block_size) == str: - args.block_size = refine_integer_value(args.block_size) +def create_file(filename, num_bytes): + block_size, block_count = get_block_size_and_count(num_bytes) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}']) + print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....') + run_job(dd_job) + print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....') diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py index 019ec05d49d3..10fb638347bc 100644 --- a/csrc/aio/py_test/validate_async_io.py +++ b/csrc/aio/py_test/validate_async_io.py @@ -7,3 +7,4 @@ """ from deepspeed.ops.op_builder import AsyncIOBuilder assert AsyncIOBuilder().is_compatible() +assert AsyncIOBuilder().load() From 3abe3835b6fb3e7e7e7a38977af9059000dc2378 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Tue, 14 May 2024 19:03:08 +0000 Subject: [PATCH 02/31] fixing sweep scripts --- csrc/aio/py_test/run_read_sweep.sh | 14 ++++----- csrc/aio/py_test/run_write_sweep.sh | 47 +++++++++++++++++------------ 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index d69aa13e49da..56c81f41eb70 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -22,6 +22,7 @@ validate_environment IO_SIZE=$1 LOG_DIR=$2/aio_perf_sweep +MAP_DIR=$2/aio GPU_MEM=$3 RUN_SCRIPT=./test_ds_aio.py READ_OPT="--read" @@ -42,8 +43,6 @@ DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" for sub in single block; do - ftd_map="--folder_to_device_mapping \ - /workspace/nvme01/aio:0 " if [[ $sub == "single" ]]; then sub_opt="--single_submit" else @@ -55,21 +54,22 @@ for sub in single block; do else ov_opt="" fi - for p in 1 ; do + for p in 1 2 4 8; do for t in 1 2 4 8; do - for d in 8 16 32; do + for d in 16 32 64; do for bs in 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} ${ftd_map}" - OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE}" + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" echo ${DISABLE_CACHE} echo ${cmd} echo ${SYNC} + eval ${DISABLE_CACHE} eval ${cmd} eval ${SYNC} - sleep 2 + sleep 1 done done done diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh index 99f2113dda6f..d8abc6869c50 100755 --- a/csrc/aio/py_test/run_write_sweep.sh +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -25,25 +25,32 @@ function validate_environment() validate_environment -if [[ $# -ne 3 ]]; then - echo "Usage: $0 " +if [[ $# -ne 2 ]]; then + echo "Usage: $0 " exit 1 fi -SIZE="$1M" -WRITE_DIR=$2 -LOG_DIR=$3/aio_perf_sweep +IO_SIZE=$1 +LOG_DIR=$2/aio_perf_sweep +MAP_DIR=$2/aio +GPU_MEM=$3 +RUN_SCRIPT=./test_ds_aio.py -OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt -WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}" +OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt +WRITE_OPT="" -prep_folder ${WRITE_DIR} +prep_folder ${MAP_DIR} prep_folder ${LOG_DIR} -RUN_SCRIPT=./test_ds_aio.py -DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " +if [[ ${GPU_MEM} == "gpu" ]]; then + gpu_opt="--gpu" +else + gpu_opt="" +fi + +DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" for sub in single block; do @@ -53,19 +60,19 @@ for sub in single block; do sub_opt="" fi for ov in overlap sequential; do - if [[ $ov == "overlap" ]]; then - ov_opt="--overlap_events" + if [[ $ov == "sequential" ]]; then + ov_opt="--sequential_requests" else ov_opt="" fi - for t in 1 2 4 8; do - for p in 1; do - for d in 1 2 4 8 16 32; do - for bs in 128K 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" - OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" + for p in 1 2 4 8; do + for t in 1 2 4 8; do + for d in 16 32 64; do + for bs in 256K 512K 1M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" + cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" echo ${DISABLE_CACHE} echo ${cmd} echo ${SYNC} @@ -73,7 +80,7 @@ for sub in single block; do eval ${DISABLE_CACHE} eval ${cmd} eval ${SYNC} - sleep 2 + sleep 1 done done done From 1b5dd7462fc01c2e4bf97465482d18606bfc7adf Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Fri, 17 May 2024 21:44:27 +0000 Subject: [PATCH 03/31] adding gds op --- csrc/aio/py_lib/deepspeed_aio_op_desc.cpp | 38 +++++ csrc/aio/py_lib/deepspeed_aio_op_desc.h | 41 +++++ csrc/aio/py_lib/deepspeed_aio_thread.cpp | 55 +------ csrc/aio/py_lib/deepspeed_aio_thread.h | 24 +-- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 62 ++++++++ csrc/aio/py_lib/deepspeed_cpu_op.h | 30 ++++ csrc/aio/py_lib/deepspeed_gds_op.cpp | 162 ++++++++++++++++++++ csrc/aio/py_lib/deepspeed_gds_op.h | 44 ++++++ csrc/aio/py_lib/deepspeed_gds_utils.h | 91 +++++++++++ csrc/aio/py_lib/deepspeed_py_aio.cpp | 3 - csrc/aio/py_lib/deepspeed_py_aio.h | 5 +- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 62 ++++++-- csrc/aio/py_lib/deepspeed_py_aio_handle.h | 8 + csrc/aio/py_lib/deepspeed_py_copy.cpp | 2 +- csrc/aio/py_lib/deepspeed_py_copy.h | 3 - csrc/aio/py_lib/py_ds_aio.cpp | 13 +- csrc/aio/py_test/ds_aio_args.py | 7 + csrc/aio/py_test/ds_aio_handle.py | 7 +- csrc/aio/py_test/run_read_sweep.sh | 8 +- csrc/aio/py_test/run_write_sweep.sh | 13 +- op_builder/async_io.py | 50 +++--- 21 files changed, 596 insertions(+), 132 deletions(-) create mode 100644 csrc/aio/py_lib/deepspeed_aio_op_desc.cpp create mode 100644 csrc/aio/py_lib/deepspeed_aio_op_desc.h create mode 100644 csrc/aio/py_lib/deepspeed_cpu_op.cpp create mode 100644 csrc/aio/py_lib/deepspeed_cpu_op.h create mode 100644 csrc/aio/py_lib/deepspeed_gds_op.cpp create mode 100644 csrc/aio/py_lib/deepspeed_gds_op.h create mode 100644 csrc/aio/py_lib/deepspeed_gds_utils.h diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp new file mode 100644 index 000000000000..5c9bb033c0c2 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "deepspeed_aio_op_desc.h" + +using namespace std; + +io_op_desc_t::io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) + : _read_op(read_op), + _buffer(buffer), + _fd(fd), + _filename(filename), + _file_num_bytes(file_num_bytes), + _num_threads(num_threads), + _num_bytes_per_thread(file_num_bytes / num_threads), + _validate(validate) +{ +} + +char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void io_op_desc_t::fini() {} + +void io_op_desc_t::validate() {} + +void io_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ +} diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h new file mode 100644 index 000000000000..c5bffae10265 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#ifndef _IO_OP_DESC_T_ +#define _IO_OP_DESC_T_ +#include +#include +#include "deepspeed_py_aio.h" + +struct io_op_desc_t { + const bool _read_op; + torch::Tensor _buffer; + int _fd; + const std::string _filename; + const long long int _file_num_bytes; + const int _num_threads; + const int _num_bytes_per_thread; + torch::Tensor _contiguous_buffer; + const bool _validate; + + io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate); + + virtual void run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config); + + virtual char* data_ptr() const; + + virtual void validate(); + + virtual void fini(); +}; +#endif // _IO_OP_DESC_T_ diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp index c852711a28c0..30c3b4914397 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -9,50 +9,8 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include "deepspeed_aio_thread.h" -#if defined(__ENABLE_CANN__) -#include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include "torch_npu/csrc/framework/utils/UtilForOpAdapter.h" -#endif - using namespace std; -io_op_desc_t::io_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int num_bytes, - const bool validate) - : _read_op(read_op), - _buffer(buffer), - _fd(fd), - _filename(filename), - _num_bytes(num_bytes), - _validate(validate) -{ - _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu() -#if defined(__ENABLE_CANN__) - || torch_npu::utils::is_npu(_buffer) -#endif - ) - ? _buffer.to(torch::kCPU).pin_memory() - : _buffer; - _contiguous_buffer = _cpu_buffer.contiguous(); -} - -char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } - -void io_op_desc_t::fini() -{ - if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } - if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } -#if defined(__ENABLE_CANN__) - if (_read_op && torch_npu::utils::is_npu(_buffer)) { - auto device = at::Device("npu:0"); - _buffer.copy_(_cpu_buffer.to(device)); - } -#endif -} - deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) : _tid(tid), _aio_config(aio_config), @@ -79,18 +37,7 @@ void deepspeed_aio_thread_t::run() } if (next_io_op) { - const auto base_offset = next_io_op->_num_bytes * _tid; - - std::unique_ptr xfer_ctxt(new io_xfer_ctxt( - next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr())); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap( - next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential( - next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } + next_io_op->run(_tid, _aio_ctxt, &_aio_config); { std::lock_guard lock(_complete_sync._mutex); diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h index 20799ecbb018..3cb3c5c3731f 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.h +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -10,28 +10,8 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include #include -#include "deepspeed_py_aio.h" - -struct io_op_desc_t { - const bool _read_op; - torch::Tensor _buffer; - int _fd; - const std::string _filename; - const long long int _num_bytes; - torch::Tensor _cpu_buffer; - torch::Tensor _contiguous_buffer; - const bool _validate; - - io_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int num_bytes, - const bool validate); - - char* data_ptr() const; - void fini(); -}; +#include "deepspeed_cpu_op.h" +#include "deepspeed_gds_op.h" struct thread_sync_t { std::mutex _mutex; diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp new file mode 100644 index 000000000000..6a1696598ed8 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "deepspeed_cpu_op.h" + +using namespace std; + +cpu_op_desc_t::cpu_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), + _cpu_buffer(buffer) +{ + if (_buffer.is_cuda()) { + if (_read_op) { + auto options = torch::TensorOptions() + .dtype(_buffer.dtype()) + .layout(_buffer.layout()) + .device(torch::kCPU); + _cpu_buffer = torch::empty(_buffer.nbytes(), options).pin_memory(); + } else { + _cpu_buffer = _buffer.to(torch::kCPU).pin_memory(); + } + } + + _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void cpu_op_desc_t::fini() +{ + if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } +} + +void cpu_op_desc_t::validate() +{ + validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes); +} + +void cpu_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ + assert(tid < _num_threads); + const auto base_offset = _num_bytes_per_thread * tid; + + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr())); + + if (aio_config->_overlap_events) { + do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } else { + do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } +} diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h new file mode 100644 index 000000000000..d61fe4f3c545 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include "deepspeed_aio_op_desc.h" + +struct cpu_op_desc_t : io_op_desc_t { + torch::Tensor _cpu_buffer; + + cpu_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate); + + void run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config); + + char* data_ptr() const; + + void validate(); + + void fini(); +}; diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp new file mode 100644 index 000000000000..077f0be84c8e --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp @@ -0,0 +1,162 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_gds_op.h" +#include +#include + +using namespace std; + +// For when there is more than 1 device +// static std::set base_buffer_registry; +static std::map> base_ptr_registry; + +void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle) +{ + memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t)); + cf_descr.handle.fd = fd; + cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; + close(fd); + exit(EXIT_FAILURE); + } +} + +gds_op_desc_t::gds_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate) +{ + // assert(_buffer.is_cuda()); + _contiguous_buffer = _buffer.contiguous(); + + const int64_t device = _buffer.get_device(); + + char * buf_ptr = (char *)_contiguous_buffer.data_ptr(); + int64_t last = -1; + int64_t ptr_diff; + for (const auto& value : base_ptr_registry[device]) { + ptr_diff = buf_ptr - (char *)value; + if (last == -1 && ptr_diff >= 0) { + last = ptr_diff; + _base_ptr = value; + } + else if ( ptr_diff < last && ptr_diff >= 0) { + last = ptr_diff; + _base_ptr = value; + } + } + if (_contiguous_buffer.data_ptr() < _base_ptr) { + std::cerr << "BASE PTR ERROR :" << _base_ptr << " BUF PTR " << _contiguous_buffer.data_ptr() << std::endl; + for (const auto& value : base_ptr_registry[device]) { + std::cerr << "BASE PTR AVAIL :" << value << std::endl; + } + exit(EXIT_FAILURE); + } + // _base_ptr = _contiguous_buffer.data_ptr(); + + check_cudaruntimecall(cudaSetDevice(device)); + + _safe_handle_register(fd, _cf_descr, _cf_handle); + +} + +char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void gds_op_desc_t::fini() +{ + //check_cuFileCall(cuFileBufDeregister(_buffer.data_ptr()), "file buffer deregister"); + cuFileHandleDeregister(_cf_handle); +} + +void gds_op_desc_t::validate() +{ + + check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); + const auto cpu_buffer = _buffer.to(torch::kCPU); + validate_aio_operation( + _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes); +} + +void gds_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ + assert(tid < _num_threads); + check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); + int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char *)_base_ptr; + const auto file_offset = _num_bytes_per_thread * tid; + + if (_read_op) { + auto ret = cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, buf_offset); } + } else { + auto ret = cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, buf_offset); } + } +} + +void gds_op_desc_t::_report_error(const ssize_t return_code, + const int error_num, + const off_t offset) +{ + const auto op_string = _read_op ? "read failed with " : "write failed with "; + const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: "; + const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code) + : cuFileGetErrorString(error_num); + std::cerr << op_string << error_string << error_code << " return code = " << return_code + << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread + << " offset = " << offset << std::endl; + exit(EXIT_FAILURE); +} + +int register_buffer(const torch::Tensor& buffer) +{ + const int64_t device = buffer.get_device(); + void * reg_ptr = buffer.data_ptr(); + + // std::cout << "REG PTR " << reg_ptr << std::endl; + // TODO: add checking to make sure pointer isn't already in set + const auto it = base_ptr_registry.find(device); + if (it == base_ptr_registry.end()) { + std::set new_ptr_set; + new_ptr_set.insert(reg_ptr); + base_ptr_registry.insert(std::pair>(device, new_ptr_set)); + } else { + base_ptr_registry[device].insert(reg_ptr); + } + + check_cudaruntimecall(cudaSetDevice(device)); + CUfileError_t status = cuFileBufRegister(reg_ptr, buffer.nbytes(), 0); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "buffer register failed:" << cuFileGetErrorString(status) << std::endl; + exit(EXIT_FAILURE); + } + return 0; +} + +int deregister_buffer(const torch::Tensor& buffer) +{ + const int64_t device = buffer.get_device(); + void * reg_ptr = buffer.data_ptr(); + + // std::cout << "DEREG PTR " << reg_ptr << std::endl; + check_cudaruntimecall(cudaSetDevice(device)); + cuFileBufDeregister(reg_ptr); + + // Remove from tracked registry + base_ptr_registry[device].erase(reg_ptr); + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_gds_op.h b/csrc/aio/py_lib/deepspeed_gds_op.h new file mode 100644 index 000000000000..21f466ecac12 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_gds_op.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include + +#include "deepspeed_aio_op_desc.h" +#include "deepspeed_gds_utils.h" + +struct gds_op_desc_t : io_op_desc_t { + CUfileDescr_t _cf_descr; + CUfileHandle_t _cf_handle; + void* _base_ptr; + + gds_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate); + + void run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config); + + char* data_ptr() const; + + void validate(); + + void fini(); + + void _read_file(const int tid); + + void _write_file(const int tid); + + void _report_error(const ssize_t return_code, const int error_num, const off_t offset); +}; + +int register_buffer(const torch::Tensor& buffer); + +int deregister_buffer(const torch::Tensor& buffer); diff --git a/csrc/aio/py_lib/deepspeed_gds_utils.h b/csrc/aio/py_lib/deepspeed_gds_utils.h new file mode 100644 index 000000000000..12b014d90988 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_gds_utils.h @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +// CUDA/cuFile includes +#include +#include +#include "cufile.h" + +// Macro for checking cuda errors following a cuda launch or api call +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define check_cudadrivercall(fn) \ + do { \ + CUresult res = fn; \ + if (res != CUDA_SUCCESS) { \ + const char* str = nullptr; \ + cuGetErrorName(res, &str); \ + std::cerr << "cuda driver api call failed " << #fn << " res : " << res << ", " \ + << __LINE__ << ":" << str << std::endl; \ + std::cerr << "EXITING program!!!" << std::endl; \ + exit(1); \ + } \ + } while (0) + +#define check_cudaruntimecall(fn) \ + do { \ + cudaError_t res = fn; \ + if (res != cudaSuccess) { \ + const char* str = cudaGetErrorName(res); \ + std::cerr << "cuda runtime api call failed " << #fn << __LINE__ << ":" << str \ + << std::endl; \ + std::cerr << "EXITING program!!!" << std::endl; \ + exit(1); \ + } \ + } while (0) + +#define check_cuFileCall(fn, api_msg) \ + do { \ + CUfileError_t status = fn; \ + if (status.err != CU_FILE_SUCCESS) { \ + std::cout << api_msg << " failed with error " << CUFILE_ERRSTR(status.err) \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// +// cuda driver error description +// +static inline const char* GetCuErrorString(CUresult curesult) +{ + const char* descp; + if (cuGetErrorName(curesult, &descp) != CUDA_SUCCESS) descp = "unknown cuda error"; + return descp; +} + +// +// cuFile APIs return both cuFile specific error codes as well as POSIX error codes +// for ease, the below template can be used for getting the error description depending +// on its type. + +// POSIX +template ::value, std::nullptr_t>::type = nullptr> +std::string cuFileGetErrorString(T status) +{ + status = std::abs(status); + return IS_CUFILE_ERR(status) ? std::string(CUFILE_ERRSTR(status)) + : std::string(std::strerror(status)); +} + +// CUfileError_t +template ::value, std::nullptr_t>::type = nullptr> +std::string cuFileGetErrorString(T status) +{ + std::string errStr = cuFileGetErrorString(static_cast(status.err)); + if (IS_CUDA_ERR(status)) errStr.append(".").append(GetCuErrorString(status.cu_err)); + return errStr; +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 387b713f2bfc..30b6682ada72 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -4,9 +4,6 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h index 11d5225de9f1..ba794db5440d 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.h +++ b/csrc/aio/py_lib/deepspeed_py_aio.h @@ -4,10 +4,7 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +Functionality for swapping tensors to/from (NVMe) storage devices. */ #include diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index c21e92de9449..b4dc0534fd15 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -4,9 +4,6 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ @@ -14,16 +11,38 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. using namespace std; +bool deepspeed_aio_handle_t::s_cuFile_init = false; + static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } +static std::shared_ptr _create_io_op_desc(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate, + const bool use_gds) +{ + if (buffer.is_cuda() && use_gds) { + return std::make_shared( + read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); + } else { + return std::make_shared( + read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); + } +} + deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, + const bool use_gds, const int num_threads) : _aio_ctxt(new aio_context(block_size, queue_depth)), _single_submit(single_submit), _overlap_events(overlap_events), + _use_gds(use_gds), _num_threads(num_threads), _aio_config(block_size, queue_depth, single_submit, overlap_events, false), _num_pending_ops(0), @@ -36,6 +55,12 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, for (auto& ctxt : _thread_contexts) { _threads.push_back(std::thread(_start_aio_thread, ctxt)); } + + if (!deepspeed_aio_handle_t::s_cuFile_init) { + cuFileDriverOpen(); + cudaCheckError(); + deepspeed_aio_handle_t::s_cuFile_init = true; + } } deepspeed_aio_handle_t::~deepspeed_aio_handle_t() @@ -58,6 +83,8 @@ const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_su const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } +const bool deepspeed_aio_handle_t::get_use_gds() const { return _use_gds; } + const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) @@ -179,16 +206,12 @@ int deepspeed_aio_handle_t::wait() while (_num_pending_ops > 0) { auto completed_op = _wait_for_aio_work(); + if (completed_op->_validate) { completed_op->validate(); } + completed_op->fini(); close(completed_op->_fd); - if (completed_op->_validate) { - validate_aio_operation(completed_op->_read_op, - completed_op->_filename.c_str(), - completed_op->data_ptr(), - _num_threads * completed_op->_num_bytes); - } --_num_pending_ops; ++num_completed_ops; } @@ -201,7 +224,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, { const auto op_string = read_op ? "Read" : "Write"; if (num_bytes % get_thread_count()) { - std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes << " not divisible by thread count = " << get_thread_count() << std::endl; return false; } @@ -233,8 +256,8 @@ int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, const auto fd = open_file(filename, true); if (fd == -1) { return -1; } - auto scheduled_op = std::make_shared( - true, buffer, fd, filename, (num_file_bytes / _num_threads), validate); + auto scheduled_op = _create_io_op_desc( + true, buffer, fd, filename, num_file_bytes, _num_threads, validate, _use_gds); _schedule_aio_work(scheduled_op); @@ -248,6 +271,7 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, const bool validate, const bool async) { + const auto num_write_bytes = static_cast(buffer.nbytes()); assert((num_write_bytes % _num_threads) == 0); @@ -256,8 +280,8 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, const auto fd = open_file(filename, false); if (fd == -1) { return -1; } - auto scheduled_op = std::make_shared( - false, buffer, fd, filename, (num_write_bytes / _num_threads), validate); + auto scheduled_op = _create_io_op_desc( + false, buffer, fd, filename, num_write_bytes, _num_threads, validate, _use_gds); _schedule_aio_work(scheduled_op); @@ -296,3 +320,13 @@ bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor { return _pinned_tensor_mgr->free(locked_tensor); } + +int deepspeed_aio_handle_t::new_device_locked_tensor(const torch::Tensor& buffer) +{ + return register_buffer(buffer); +} + +int deepspeed_aio_handle_t::free_device_locked_tensor(const torch::Tensor& buffer) +{ + return deregister_buffer(buffer); +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index 3a254c3814a2..db11a81426b6 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -16,8 +16,10 @@ struct deepspeed_aio_handle_t { std::unique_ptr _aio_ctxt; const bool _single_submit; const bool _overlap_events; + const bool _use_gds; const int _num_threads; deepspeed_aio_config_t _aio_config; + static bool s_cuFile_init; std::vector> _thread_contexts; std::vector _threads; @@ -28,6 +30,7 @@ struct deepspeed_aio_handle_t { const int queue_depth, const bool single_submit, const bool overlap_events, + const bool use_gds, const int num_threads); ~deepspeed_aio_handle_t(); @@ -36,6 +39,7 @@ struct deepspeed_aio_handle_t { const int get_queue_depth() const; const bool get_single_submit() const; const bool get_overlap_events() const; + const bool get_use_gds() const; const int get_thread_count() const; int read(torch::Tensor& buffer, const char* filename, const bool validate); @@ -65,6 +69,10 @@ struct deepspeed_aio_handle_t { bool free_cpu_locked_tensor(torch::Tensor&); + int new_device_locked_tensor(const torch::Tensor& example_tensor); + + int free_device_locked_tensor(const torch::Tensor& example_tensor); + int wait(); void _stop_threads(); diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index 8a59107dd347..561c46f7c287 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -4,7 +4,7 @@ // DeepSpeed Team /* -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +Functionality for swapping tensors to/from (NVMe) storage devices. */ #include "deepspeed_py_copy.h" diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h index 19ba28317d00..f443571a3e7b 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.h +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -4,9 +4,6 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index 9033549bc0d2..8e1736013d6e 100755 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -10,6 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include "deepspeed_py_aio_handle.h" #include "deepspeed_py_copy.h" +using namespace pybind11::literals; PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { @@ -20,12 +21,20 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); py::class_(m, "aio_handle") - .def(py::init()) + .def(py::init(), + "AIO handle constructor", + "block_size"_a, + "queue_depth"_a, + "single_submit"_a, + "overlap_events"_a, + "use_gds"_a, + "num_threads"_a) .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) + .def("get_use_gds", &deepspeed_aio_handle_t::get_use_gds) .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) .def("read", &deepspeed_aio_handle_t::read) @@ -41,6 +50,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor) .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor) + .def("new_device_locked_tensor", &deepspeed_aio_handle_t::new_device_locked_tensor) + .def("free_device_locked_tensor", &deepspeed_aio_handle_t::free_device_locked_tensor) .def("wait", &deepspeed_aio_handle_t::wait); } diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py index 66d843d68ea2..0e018063b10a 100644 --- a/csrc/aio/py_test/ds_aio_args.py +++ b/csrc/aio/py_test/ds_aio_args.py @@ -83,6 +83,11 @@ def validate_args(args): no_error = no_error and no_mapping_error error_messages += mapping_error_messages + # Validate --gpu, --use_gds + if args.use_gds and not args.gpu: + error_messages.append(f'--gpu must be set to transfer with --use_gds') + no_error = False + if not no_error: print(f'Found {len(error_messages)} validation errors') for i, msg in enumerate(error_messages): @@ -141,6 +146,8 @@ def parse_arguments(): parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO') + parser.add_argument('--slow_bounce_buffer', action='store_true', help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.') diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 369cb9d4030f..a7600a033002 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -20,6 +20,7 @@ def pre_handle(args, tid, read_op): io_string = "Read" if read_op else "Write" + gds = True if args.use_gds else False device_id, folder = args.mapping_list[tid] filename = create_filename(folder, args.read, args.io_size, tid) if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size): @@ -30,7 +31,7 @@ def pre_handle(args, tid, read_op): if args.gpu: device_name = get_accelerator().device_name(device_id) buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name) - if not args.slow_bounce_buffer: + if not (args.slow_bounce_buffer or gds): bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory() else: @@ -41,7 +42,9 @@ def pre_handle(args, tid, read_op): io_parallel = args.io_parallel if args.io_parallel else 1 handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, - not args.sequential_requests, io_parallel) + not args.sequential_requests, gds,io_parallel) + if gds: + handle.new_device_locked_tensor(buffer) task_log(tid, f'created deepspeed aio handle') ctxt = {} diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 56c81f41eb70..2590ad92bd27 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -24,6 +24,7 @@ IO_SIZE=$1 LOG_DIR=$2/aio_perf_sweep MAP_DIR=$2/aio GPU_MEM=$3 +USE_GDS=$4 RUN_SCRIPT=./test_ds_aio.py READ_OPT="--read" @@ -38,6 +39,11 @@ if [[ ${GPU_MEM} == "gpu" ]]; then else gpu_opt="" fi +if [[ ${USE_GDS} == "gds" ]]; then + gds_opt="--use_gds" +else + gds_opt="" +fi DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" @@ -58,7 +64,7 @@ for sub in single block; do for t in 1 2 4 8; do for d in 16 32 64; do for bs in 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}" + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh index d8abc6869c50..544be4e5a0c2 100755 --- a/csrc/aio/py_test/run_write_sweep.sh +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -25,15 +25,11 @@ function validate_environment() validate_environment -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " - exit 1 -fi - IO_SIZE=$1 LOG_DIR=$2/aio_perf_sweep MAP_DIR=$2/aio GPU_MEM=$3 +USE_GDS=$4 RUN_SCRIPT=./test_ds_aio.py OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt @@ -49,6 +45,11 @@ if [[ ${GPU_MEM} == "gpu" ]]; then else gpu_opt="" fi +if [[ ${USE_GDS} == "gds" ]]; then + gds_opt="--use_gds" +else + gds_opt="" +fi DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" @@ -69,7 +70,7 @@ for sub in single block; do for t in 1 2 4 8; do for d in 16 32 64; do for bs in 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}" + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt}--folder ${MAP_DIR}" OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" diff --git a/op_builder/async_io.py b/op_builder/async_io.py index b55c821910b9..e998daa2c376 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -3,13 +3,14 @@ # DeepSpeed Team +import os import distutils.spawn import subprocess -from .builder import OpBuilder +from .builder import TorchCPUOpBuilder -class AsyncIOBuilder(OpBuilder): +class AsyncIOBuilder(TorchCPUOpBuilder): BUILD_VAR = "DS_BUILD_AIO" NAME = "async_io" @@ -25,38 +26,45 @@ def sources(self): 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', - 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_gds_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] def include_paths(self): - return ['csrc/aio/py_lib', 'csrc/aio/common'] + import torch + if self.build_for_cpu: + CUDA_INCLUDE = [] + elif not self.is_rocm_pytorch(): + CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] + else: + CUDA_INCLUDE = [ + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), + ] + return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O - CPU_ARCH = self.cpu_arch() - SIMD_WIDTH = self.simd_width() - import torch # Keep this import here to avoid errors when building DeepSpeed wheel without torch installed - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - CPP_STD = '-std=c++17' - else: - CPP_STD = '-std=c++14' - return [ - '-g', + args = super().cxx_args() + args += [ '-Wall', '-O0', - CPP_STD, '-shared', '-fPIC', '-Wno-reorder', - CPU_ARCH, - '-fopenmp', - SIMD_WIDTH, - '-laio', ] + return args + def extra_ldflags(self): - return ['-laio'] + if self.build_for_cpu: + return ['-fopenmp'] + + import torch.utils.cpp_extension + CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME + CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64") + return [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart', '-lcufile'] def check_for_libaio_pkg(self): libs = dict( @@ -85,7 +93,7 @@ def is_compatible(self, verbose=True): # which is a function provided by libaio that is used in the async_io op. # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS # respectively to specify the directories for libaio.h and libaio.so. - aio_compatible = self.has_function('io_pgetevents', ('aio', )) + aio_compatible = self.has_function('io_submit', ('aio', )) if verbose and not aio_compatible: self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") From 084e03e83b246c6b5621cebd5fb1690d340cbc0f Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Thu, 23 May 2024 23:05:29 +0000 Subject: [PATCH 04/31] setting gds block size --- csrc/aio/py_lib/deepspeed_gds_op.cpp | 2 -- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 6 ++++++ csrc/aio/py_test/ds_aio_handle.py | 4 +++- csrc/aio/py_test/run_read_sweep.sh | 4 ++-- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp index 077f0be84c8e..8b4f8be6e22f 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.cpp +++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp @@ -65,8 +65,6 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, } exit(EXIT_FAILURE); } - // _base_ptr = _contiguous_buffer.data_ptr(); - check_cudaruntimecall(cudaSetDevice(device)); _safe_handle_register(fd, _cf_descr, _cf_handle); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index b4dc0534fd15..f2b90919ef15 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -59,6 +59,12 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, if (!deepspeed_aio_handle_t::s_cuFile_init) { cuFileDriverOpen(); cudaCheckError(); + size_t direct_io_size = (size_t)block_size / 1024; + CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; + exit(EXIT_FAILURE); + } deepspeed_aio_handle_t::s_cuFile_init = true; } } diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index a7600a033002..9e55ae6cacf1 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -51,6 +51,7 @@ def pre_handle(args, tid, read_op): ctxt['file'] = filename ctxt['num_bytes'] = args.io_size ctxt['handle'] = handle + ctxt['gds'] = gds ctxt[BUFFER] = buffer ctxt[BOUNCE_BUFFER] = bounce_buffer ctxt['elapsed_sec'] = 0 @@ -74,6 +75,8 @@ def post_handle(pool_params): _, _, ctxt = pool_params for buf in [BUFFER, BOUNCE_BUFFER]: if ctxt[buf] is not None: + if ctxt['gds']: + ctxt['handle'].free_device_locked_tensor(ctxt[buf]) ctxt[buf].detach() ctxt[buf] = None return ctxt @@ -92,7 +95,6 @@ def main_parallel_read(pool_params): ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time - return ctxt diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 2590ad92bd27..0f26d718afac 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -62,7 +62,7 @@ for sub in single block; do fi for p in 1 2 4 8; do for t in 1 2 4 8; do - for d in 16 32 64; do + for d in 32 64 128; do for bs in 256K 512K 1M; do SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" @@ -75,7 +75,7 @@ for sub in single block; do eval ${DISABLE_CACHE} eval ${cmd} eval ${SYNC} - sleep 1 + sleep 5 done done done From cef9af08552edeac8f894de21b1c35c0e3e8e668 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Tue, 28 May 2024 21:24:49 +0000 Subject: [PATCH 05/31] gds working w/threads --- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 36 ++++++++++++++++----- csrc/aio/py_lib/deepspeed_py_aio_handle.h | 2 +- csrc/aio/py_test/ds_aio_args.py | 2 +- csrc/aio/py_test/run_read_sweep.sh | 27 ++++++++++++---- 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index f2b90919ef15..6bd2f6385c7f 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -8,6 +8,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ #include "deepspeed_py_aio_handle.h" +#include +#include +#include using namespace std; @@ -48,15 +51,20 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, _num_pending_ops(0), _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) { - for (auto i = 0; i < num_threads; ++i) { - _thread_contexts.push_back(std::make_shared(i, _aio_config)); - } - - for (auto& ctxt : _thread_contexts) { - _threads.push_back(std::thread(_start_aio_thread, ctxt)); - } - if (!deepspeed_aio_handle_t::s_cuFile_init) { + if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) { + std::string depthStr = std::to_string(queue_depth); + std::string threadsStr = std::to_string(num_threads); + std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", "; + std::string json2 = R"("max_request_parallelism": )"+threadsStr+", "; + std::string json3 = R"("max_io_threads": )"+threadsStr+", "; + std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})"; + std::ofstream outFile("local_cufile.json"); + if (outFile.is_open()){ + outFile << json1 + json2 + json3 + json4; + outFile.close(); + } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);} + putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); cuFileDriverOpen(); cudaCheckError(); size_t direct_io_size = (size_t)block_size / 1024; @@ -66,6 +74,17 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, exit(EXIT_FAILURE); } deepspeed_aio_handle_t::s_cuFile_init = true; + // GDS threads handled internally + _thread_contexts.push_back(std::make_shared(0, _aio_config)); + _num_threads = 1; + } else { // CPU OP + for (auto i = 0; i < num_threads; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); + } + } + + for (auto& ctxt : _thread_contexts) { + _threads.push_back(std::thread(_start_aio_thread, ctxt)); } } @@ -73,6 +92,7 @@ deepspeed_aio_handle_t::~deepspeed_aio_handle_t() { _stop_threads(); for (auto& thr : _threads) { thr.join(); } + if (_use_gds) {cuFileDriverClose();} } const int deepspeed_aio_handle_t::get_block_size() const diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index db11a81426b6..bc3f6818d402 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -17,7 +17,7 @@ struct deepspeed_aio_handle_t { const bool _single_submit; const bool _overlap_events; const bool _use_gds; - const int _num_threads; + int _num_threads; deepspeed_aio_config_t _aio_config; static bool s_cuFile_init; diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py index 0e018063b10a..5fc3098d3357 100644 --- a/csrc/aio/py_test/ds_aio_args.py +++ b/csrc/aio/py_test/ds_aio_args.py @@ -140,7 +140,7 @@ def parse_arguments(): parser.add_argument('--handle', action='store_true', help='Use AIO handle.') - parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') + parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions') parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 0f26d718afac..83afe291ec7e 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -17,6 +17,14 @@ function validate_environment() fi } +function fileExists() { + local file="$1" + if [[ -f "$file" ]]; then + return 0 + else + return 1 + fi +} validate_environment @@ -68,14 +76,19 @@ for sub in single block; do OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 5 + if fileExists ${LOG}; then + echo "Log Exists" + sleep 2 + else + echo ${DISABLE_CACHE} + echo ${cmd} + echo ${SYNC} + eval ${DISABLE_CACHE} + eval ${cmd} + eval ${SYNC} + sleep 2 + fi done done done From 86594a489f1268c7418bc1a12ea1c933e5988ac9 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Thu, 30 May 2024 19:20:39 +0000 Subject: [PATCH 06/31] keeping in case container delete --- csrc/aio/py_test/run_read_sweep.sh | 66 ++++++++++++++--------------- csrc/aio/py_test/run_write_sweep.sh | 6 +-- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 83afe291ec7e..ea2bceece148 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -4,6 +4,15 @@ if [[ $# -lt 2 ]]; then exit 1 fi +function prep_folder() +{ + folder=$1 + if [[ -d ${folder} ]]; then + rm -f ${folder}/* + else + mkdir -p ${folder} + fi +} function validate_environment() { @@ -36,6 +45,9 @@ USE_GDS=$4 RUN_SCRIPT=./test_ds_aio.py READ_OPT="--read" +prep_folder ${MAP_DIR} +prep_folder ${LOG_DIR} + if [[ -d ${LOG_DIR} ]]; then rm -f ${LOG_DIR}/* else @@ -55,43 +67,27 @@ fi DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" +sub_opt="" +sub="block" +ov_opt="" +ov="overlap" +t=8 -for sub in single block; do - if [[ $sub == "single" ]]; then - sub_opt="--single_submit" - else - sub_opt="" - fi - for ov in overlap sequential; do - if [[ $ov == "sequential" ]]; then - ov_opt="--sequential_requests" - else - ov_opt="" - fi - for p in 1 2 4 8; do - for t in 1 2 4 8; do - for d in 32 64 128; do - for bs in 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" - OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" - LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" +for p in 1 8; do + for d in 64 128; do + for bs in 8M 16M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" + LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - if fileExists ${LOG}; then - echo "Log Exists" - sleep 2 - else - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 2 - fi - done - done - done + echo ${DISABLE_CACHE} + echo ${cmd} + echo ${SYNC} + eval ${DISABLE_CACHE} + eval ${cmd} + eval ${SYNC} + sleep 2 done done done diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh index 544be4e5a0c2..a54d1c8d7bed 100755 --- a/csrc/aio/py_test/run_write_sweep.sh +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -68,9 +68,9 @@ for sub in single block; do fi for p in 1 2 4 8; do for t in 1 2 4 8; do - for d in 16 32 64; do + for d in 32 64 128; do for bs in 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt}--folder ${MAP_DIR}" + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" @@ -81,7 +81,7 @@ for sub in single block; do eval ${DISABLE_CACHE} eval ${cmd} eval ${SYNC} - sleep 1 + sleep 2 done done done From d13b1ab87004348e779a0db1d8d5ffb9cd6f89c6 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Thu, 30 May 2024 21:49:26 +0000 Subject: [PATCH 07/31] ftd in read sweep --- csrc/aio/py_test/run_read_sweep.sh | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index ea2bceece148..1036e6bdb4aa 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -72,22 +72,21 @@ sub="block" ov_opt="" ov="overlap" t=8 +p=8 -for p in 1 8; do - for d in 64 128; do - for bs in 8M 16M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" - OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" - LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" +for d in 64 128; do + for bs in 8M 16M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /workspace/nvme03:0 /workspace/nvme03:1 /workspace/nvme03:2 /workspace/nvme03:3 /workspace/nvme47:4 /workspace/nvme47:5 /workspace/nvme47:6 /workspace/nvme47:7" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" + LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 2 - done + echo ${DISABLE_CACHE} + echo ${cmd} + echo ${SYNC} + eval ${DISABLE_CACHE} + eval ${cmd} + eval ${SYNC} + sleep 2 done done From 4d9c27e864f609e5d2d83f3fbd8dddbedc211563 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Wed, 3 Jul 2024 18:47:28 +0000 Subject: [PATCH 08/31] changes in master to make it run --- csrc/aio/py_test/run_read_sweep.sh | 18 +++++++++--------- deepspeed/elasticity/elastic_agent.py | 7 +++++-- requirements/requirements.txt | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 1036e6bdb4aa..f474791af668 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -16,7 +16,7 @@ function prep_folder() function validate_environment() { - validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python ./validate_async_io.py" + validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python3 ./validate_async_io.py" eval ${validate_cmd} res=$? if [[ $res != 0 ]]; then @@ -38,12 +38,12 @@ function fileExists() { validate_environment IO_SIZE=$1 -LOG_DIR=$2/aio_perf_sweep +LOG_DIR=./1nvme_cpu_write MAP_DIR=$2/aio GPU_MEM=$3 USE_GDS=$4 RUN_SCRIPT=./test_ds_aio.py -READ_OPT="--read" +READ_OPT="" prep_folder ${MAP_DIR} prep_folder ${LOG_DIR} @@ -65,21 +65,21 @@ else gds_opt="" fi -DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " -SYNC="sync" +DISABLE_CACHE="sudo sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " +SYNC="sudo sync" sub_opt="" sub="block" ov_opt="" ov="overlap" t=8 -p=8 +p=1 for d in 64 128; do for bs in 8M 16M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /workspace/nvme03:0 /workspace/nvme03:1 /workspace/nvme03:2 /workspace/nvme03:3 /workspace/nvme47:4 /workspace/nvme47:5 /workspace/nvme47:6 /workspace/nvme47:7" + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvmed0:0" OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" - LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" + LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="/usr/bin/time python3 ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" echo ${DISABLE_CACHE} echo ${cmd} diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index c1e8932ecaba..d079ecd516e2 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -6,7 +6,7 @@ from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent from typing import Any, Dict, Optional, Tuple from datetime import datetime -from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port +from torch.distributed.elastic.utils.distributed import get_free_port from torch.distributed.elastic.metrics import put_metric from torch.distributed.elastic.agent.server.api import ( RunResult, @@ -24,6 +24,9 @@ from contextlib import closing import subprocess +from torch.distributed.elastic.utils.logging import get_logger + +log = get_logger(__name__) class DSElasticAgent(LocalElasticAgent): @@ -44,7 +47,7 @@ def _set_master_addr_port(store: Store, master_port: Optional[int], local_addr: Optional[str] = None): if master_port is None: - sock = _get_socket_with_port() + sock = get_free_port() with closing(sock): master_port = sock.getsockname()[1] diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 80c9f9b3287a..9b923d94f619 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ hjson ninja -numpy +numpy<2.0.0 packaging>=20.0 psutil py-cpuinfo From 7c94fe8c0e6193b87b634d01d3ac0e6d14a64621 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Mon, 22 Jul 2024 23:09:17 +0000 Subject: [PATCH 09/31] compile without gds --- csrc/aio/py_lib/deepspeed_gds_op.cpp | 63 ++++++++++++++++++- csrc/aio/py_lib/deepspeed_gds_op.h | 16 ++++- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 27 +------- csrc/aio/py_test/run_read_sweep.sh | 68 ++++++++++++++------- 4 files changed, 124 insertions(+), 50 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp index 8b4f8be6e22f..34c7282cd897 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.cpp +++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp @@ -8,11 +8,36 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ #include "deepspeed_gds_op.h" -#include -#include using namespace std; +#ifdef __ENABLE_GDS__ +void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads) +{ + std::string depthStr = std::to_string(queue_depth); + std::string threadsStr = std::to_string(num_threads); + std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", "; + std::string json2 = R"("max_request_parallelism": )"+threadsStr+", "; + std::string json3 = R"("max_io_threads": )"+threadsStr+", "; + std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})"; + std::ofstream outFile("local_cufile.json"); + if (outFile.is_open()){ + outFile << json1 + json2 + json3 + json4; + outFile.close(); + } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);} + putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); + cuFileDriverOpen(); + cudaCheckError(); + size_t direct_io_size = (size_t)block_size / 1024; + CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; + exit(EXIT_FAILURE); + } +}; + +void close_gds() {cuFileDriverClose();} + // For when there is more than 1 device // static std::set base_buffer_registry; static std::map> base_ptr_registry; @@ -158,3 +183,37 @@ int deregister_buffer(const torch::Tensor& buffer) base_ptr_registry[device].erase(reg_ptr); return 0; } +#else +void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads) +{ + std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; + exit(EXIT_FAILURE); +}; +void close_gds() +{ + std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; + exit(EXIT_FAILURE); +}; +gds_op_desc_t::gds_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate) +{ + std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; + exit(EXIT_FAILURE); +}; +int register_buffer(const torch::Tensor& buffer) +{ + std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; + exit(EXIT_FAILURE); +}; +int deregister_buffer(const torch::Tensor& buffer) +{ + std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; + exit(EXIT_FAILURE); +}; +#endif diff --git a/csrc/aio/py_lib/deepspeed_gds_op.h b/csrc/aio/py_lib/deepspeed_gds_op.h index 21f466ecac12..1e955aa67558 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.h +++ b/csrc/aio/py_lib/deepspeed_gds_op.h @@ -5,13 +5,21 @@ #include #include +#include +#include +#include +#include #include "deepspeed_aio_op_desc.h" +#ifdef __ENABLE_GDS__ #include "deepspeed_gds_utils.h" +#endif struct gds_op_desc_t : io_op_desc_t { - CUfileDescr_t _cf_descr; - CUfileHandle_t _cf_handle; + #ifdef __ENABLE_GDS__ + CUfileDescr_t _cf_descr; + CUfileHandle_t _cf_handle; + #endif void* _base_ptr; gds_op_desc_t(const bool read_op, @@ -42,3 +50,7 @@ struct gds_op_desc_t : io_op_desc_t { int register_buffer(const torch::Tensor& buffer); int deregister_buffer(const torch::Tensor& buffer); + +void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads); + +void close_gds(); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 6bd2f6385c7f..d968f1f0b25a 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -8,8 +8,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ #include "deepspeed_py_aio_handle.h" -#include -#include #include using namespace std; @@ -53,29 +51,10 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, { if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) { - std::string depthStr = std::to_string(queue_depth); - std::string threadsStr = std::to_string(num_threads); - std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", "; - std::string json2 = R"("max_request_parallelism": )"+threadsStr+", "; - std::string json3 = R"("max_io_threads": )"+threadsStr+", "; - std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})"; - std::ofstream outFile("local_cufile.json"); - if (outFile.is_open()){ - outFile << json1 + json2 + json3 + json4; - outFile.close(); - } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);} - putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); - cuFileDriverOpen(); - cudaCheckError(); - size_t direct_io_size = (size_t)block_size / 1024; - CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size); - if (status.err != CU_FILE_SUCCESS) { - std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; - exit(EXIT_FAILURE); - } + init_gds_cufile(block_size, queue_depth, num_threads); deepspeed_aio_handle_t::s_cuFile_init = true; - // GDS threads handled internally _thread_contexts.push_back(std::make_shared(0, _aio_config)); + // GDS threads handled in cufile.json _num_threads = 1; } else { // CPU OP for (auto i = 0; i < num_threads; ++i) { @@ -92,7 +71,7 @@ deepspeed_aio_handle_t::~deepspeed_aio_handle_t() { _stop_threads(); for (auto& thr : _threads) { thr.join(); } - if (_use_gds) {cuFileDriverClose();} + if (_use_gds) {close_gds();} } const int deepspeed_aio_handle_t::get_block_size() const diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index f474791af668..14fa0027e004 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -38,12 +38,12 @@ function fileExists() { validate_environment IO_SIZE=$1 -LOG_DIR=./1nvme_cpu_write +LOG_DIR=./aio_perf_sweep MAP_DIR=$2/aio GPU_MEM=$3 USE_GDS=$4 RUN_SCRIPT=./test_ds_aio.py -READ_OPT="" +READ_OPT="--read" prep_folder ${MAP_DIR} prep_folder ${LOG_DIR} @@ -67,26 +67,50 @@ fi DISABLE_CACHE="sudo sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sudo sync" -sub_opt="" -sub="block" -ov_opt="" -ov="overlap" -t=8 -p=1 -for d in 64 128; do - for bs in 8M 16M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvmed0:0" - OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" - LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="/usr/bin/time python3 ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 2 +for xtype in cpu gpu gds; do + if [[ $xtype == "cpu" ]]; then + gpu_opt="" + gds_opt="" + elif [[ $xtype == "gpu" ]]; then + gpu_opt="--gpu" + gds_opt="" + else + gpu_opt="--gpu" + gds_opt="--use_gds" + fi + for sub in single block; do + if [[ $sub == "single" ]]; then + sub_opt="--single_submit" + else + sub_opt="" + fi + for ov in overlap sequential; do + if [[ $ov == "sequential" ]]; then + ov_opt="--sequential_requests" + else + ov_opt="" + fi + for p in 1 2 4 8; do + for t in 1 2 4 8; do + for d in 8 16 32 64 128; do + for bs in 128K 256K 512K 1M 2M 4M 8M 16M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme03:0" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" + LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" + + echo ${DISABLE_CACHE} + echo ${cmd} + echo ${SYNC} + eval ${DISABLE_CACHE} + eval ${cmd} + eval ${SYNC} + sleep 2 + done + done + done + done + done done done From f02ba09a502c4da4823263541edf0965b66869d6 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Tue, 23 Jul 2024 22:07:01 +0000 Subject: [PATCH 10/31] gds macro --- op_builder/async_io.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/op_builder/async_io.py b/op_builder/async_io.py index e998daa2c376..02f8bcc77faa 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -47,16 +47,21 @@ def include_paths(self): def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O args = super().cxx_args() + GDS_ENABLE = self.is_gds_enable() args += [ '-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder', + GDS_ENABLE ] return args + def is_gds_enable(): + return '-D__ENABLE_GDS__' + def extra_ldflags(self): if self.build_for_cpu: return ['-fopenmp'] From 7d3ac1002218cd6398cac5876902e4a8b35da9a3 Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Wed, 24 Jul 2024 17:40:20 +0000 Subject: [PATCH 11/31] simple gds+cpu swapper integration --- csrc/aio/py_lib/deepspeed_gds_op.cpp | 1 + .../swap_tensor/partitioned_param_swapper.py | 26 +++++++++++++------ op_builder/async_io.py | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp index 34c7282cd897..06eb4f78c399 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.cpp +++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp @@ -36,6 +36,7 @@ void init_gds_cufile(const int block_size, const int queue_depth, const int num_ } }; +// TODO: deregister and release any held onto buffers void close_gds() {cuFileDriverClose();} // For when there is more than 1 device diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index fcc6a272883f..6f09a687d98c 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -39,6 +39,7 @@ def __init__(self, ds_config, model_dtype): aio_op = AsyncIOBuilder().load(verbose=False) self.aio_handle = aio_op.aio_handle + self.use_gds = True self.dtype = model_dtype #set swap buffers, create aio handles @@ -104,19 +105,28 @@ def _configure_aio(self, ds_config): self.available_buffer_ids = [i for i in range(self.param_buffer_count)] self.reserved_buffer_ids = [] - self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer * - self.param_buffer_count), - dtype=self.dtype, - requires_grad=False), - align_bytes=0) self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], - self.aio_config[AIO_THREAD_COUNT]) + self.aio_config[AIO_SINGLE_SUBMIT], + self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT]) self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) + self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT]) + + if self.use_gds: + self.buffers = torch.empty(int(self.aligned_elements_per_buffer * + self.param_buffer_count), + dtype=self.dtype, + device='cuda', # gotta be cuda + requires_grad=False) + self.aio_read_handle.new_device_locked_tensor(self.buffers) + else: + self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer * + self.param_buffer_count), + dtype=self.dtype, + requires_grad=False), + align_bytes=0) self.swap_out_params = [] diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 02f8bcc77faa..a8620387d209 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -59,7 +59,7 @@ def cxx_args(self): return args - def is_gds_enable(): + def is_gds_enable(self): return '-D__ENABLE_GDS__' def extra_ldflags(self): From 67da243d53f9872eba9eee504075534f543ed31b Mon Sep 17 00:00:00 2001 From: Joe Mayer Date: Thu, 25 Jul 2024 19:35:25 +0000 Subject: [PATCH 12/31] working pytest --- csrc/aio/py_lib/deepspeed_gds_op.cpp | 2 +- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 4 +- tests/unit/ops/aio/test_aio.py | 70 ++++++++++++--------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp index 06eb4f78c399..207477ef455a 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.cpp +++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp @@ -36,7 +36,7 @@ void init_gds_cufile(const int block_size, const int queue_depth, const int num_ } }; -// TODO: deregister and release any held onto buffers +// TODO: deregister and release all buffers void close_gds() {cuFileDriverClose();} // For when there is more than 1 device diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index d968f1f0b25a..f13cd6dd06db 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -53,8 +53,10 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) { init_gds_cufile(block_size, queue_depth, num_threads); deepspeed_aio_handle_t::s_cuFile_init = true; - _thread_contexts.push_back(std::make_shared(0, _aio_config)); + } + if (use_gds) { // GDS threads handled in cufile.json + _thread_contexts.push_back(std::make_shared(0, _aio_config)); _num_threads = 1; } else { // CPU OP for (auto i = 0; i < num_threads; ++i) { diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index f6d175ce67bc..eb6ddd4da8cb 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -13,22 +13,26 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from unit.common import DistributedTest -KILO_BYTE = 1024 +KILO_BYTE = 1024*256 BLOCK_SIZE = KILO_BYTE QUEUE_DEPTH = 2 IO_SIZE = 4 * BLOCK_SIZE IO_PARALLEL = 2 +GDS_ENABLE=True if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True) -def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): +def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True, use_gds=False): if not get_accelerator().is_available(): if use_cuda_device: pytest.skip("GPU tensors only supported in CUDA environments.") if use_cuda_pinned_tensor: pytest.skip("CUDA-pinned tensors only supported in CUDA environments.") + if not GDS_ENABLE and use_gds: + pytest.skip("GDS not available, won't run GDS case.") + def _get_local_rank(): @@ -58,7 +62,6 @@ def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): return test_file, test_buffer -def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) @@ -70,17 +73,19 @@ def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, ind return test_file, test_buffer -def _validate_handle_state(handle, single_submit, overlap_events): +def _validate_handle_state(handle, single_submit, overlap_events, use_gds): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL + if use_gds: + assert handle.get_thread_count() == 1 + else: + assert handle.get_thread_count() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH - -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) -@pytest.mark.parametrize("single_submit", [True, False]) -@pytest.mark.parametrize("overlap_events", [True, False]) +@pytest.mark.parametrize("single_submit", [True,False]) +@pytest.mark.parametrize("overlap_events", [True,False]) +@pytest.mark.parametrize("use_cuda_pinned_tensor, use_gds", [(False,False),(True,False),(False,True)]) class TestRead(DistributedTest): world_size = 1 reuse_dist_env = True @@ -89,17 +94,20 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_parallel_read(self, tmpdir, single_submit, overlap_events, use_cuda_pinned_tensor, use_gds): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) if use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) + elif use_gds: + aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda') + h.new_device_locked_tensor(aio_buffer) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) - _validate_handle_state(h, single_submit, overlap_events) + _validate_handle_state(h, single_submit, overlap_events, use_gds) ref_file, _ = _do_ref_write(tmpdir) read_status = h.sync_pread(aio_buffer, ref_file) @@ -109,15 +117,17 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, over ref_buffer = list(f.read()) assert ref_buffer == aio_buffer.tolist() - if not use_cuda_pinned_tensor: + if use_gds: + h.free_device_locked_tensor(aio_buffer) + elif not use_cuda_pinned_tensor: h.free_cpu_locked_tensor(aio_buffer) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) use_cpu_locked_tensor = False - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) if cuda_device: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) @@ -147,6 +157,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap @pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) @pytest.mark.parametrize("single_submit", [True, False]) @pytest.mark.parametrize("overlap_events", [True, False]) +@pytest.mark.parametrize("use_gds", [False]) class TestWrite(DistributedTest): world_size = 1 reuse_dist_env = True @@ -155,11 +166,11 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) if use_cuda_pinned_tensor: aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) @@ -180,12 +191,12 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove assert filecmp.cmp(ref_file, aio_file, shallow=False) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) @@ -215,6 +226,7 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla @pytest.mark.sequential @pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) @pytest.mark.parametrize("cuda_device", [True, False]) +@pytest.mark.parametrize("use_gds", [False]) class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False @@ -223,8 +235,8 @@ class TestAsyncQueue(DistributedTest): set_dist_env = False @pytest.mark.parametrize("async_queue", [2, 3]) - def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, use_gds): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) ref_files = [] for i in range(async_queue): @@ -233,7 +245,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: @@ -270,8 +282,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): h.free_cpu_locked_tensor(t) @pytest.mark.parametrize("async_queue", [2, 3]) - def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device, use_gds): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) ref_files = [] ref_buffers = [] @@ -282,7 +294,7 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, useg_gds,IO_PARALLEL) aio_files = [] aio_buffers = [] From 048729db9ec6a820afdc4005686453d281d4b716 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 5 Aug 2024 18:27:42 -0400 Subject: [PATCH 13/31] Make GDS first class operator (#640) * Simplify GDS integration * Make GDS first class op * GDS op cleanup * gds_handle correctness * Fix unit tests * Fix gds bug in param_swapper --------- Co-authored-by: Joe Mayer --- csrc/aio/py_lib/deepspeed_aio_thread.h | 1 - csrc/aio/py_lib/deepspeed_cpu_op.cpp | 9 +- csrc/aio/py_lib/deepspeed_cpu_op.h | 1 + csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 72 ++---- csrc/aio/py_lib/deepspeed_py_aio_handle.h | 17 +- csrc/aio/py_lib/deepspeed_py_copy.cpp | 2 +- csrc/aio/py_lib/py_ds_aio.cpp | 6 +- csrc/aio/py_test/ds_aio_handle.py | 2 +- csrc/aio/py_test/run_read_sweep.sh | 2 +- ...cated_tile_access_iterator_residual_last.h | 8 +- csrc/{aio => gds}/py_lib/deepspeed_gds_op.cpp | 201 +++++---------- csrc/{aio => gds}/py_lib/deepspeed_gds_op.h | 28 +- .../{aio => gds}/py_lib/deepspeed_gds_utils.h | 0 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 95 +++++++ csrc/gds/py_lib/deepspeed_py_gds_handle.h | 40 +++ csrc/gds/py_lib/py_ds_gds.cpp | 48 ++++ csrc/gds/py_test/validate_gds.py | 10 + csrc/includes/simd.h | 2 +- csrc/xpu/includes/simd.h | 2 +- csrc/xpu/includes/type_shim.h | 10 +- deepspeed/elasticity/elastic_agent.py | 1 + deepspeed/ops/gds/__init__.py | 6 + deepspeed/runtime/swap_tensor/aio_config.py | 14 +- deepspeed/runtime/swap_tensor/constants.py | 5 +- .../swap_tensor/partitioned_param_swapper.py | 19 +- op_builder/async_io.py | 30 +-- op_builder/builder.py | 5 +- op_builder/gds.py | 50 ++++ tests/unit/ops/aio/test_aio.py | 74 +++--- tests/unit/ops/aio/test_gds.py | 244 ++++++++++++++++++ 30 files changed, 690 insertions(+), 314 deletions(-) rename csrc/{aio => gds}/py_lib/deepspeed_gds_op.cpp (54%) rename csrc/{aio => gds}/py_lib/deepspeed_gds_op.h (68%) rename csrc/{aio => gds}/py_lib/deepspeed_gds_utils.h (100%) create mode 100644 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp create mode 100644 csrc/gds/py_lib/deepspeed_py_gds_handle.h create mode 100644 csrc/gds/py_lib/py_ds_gds.cpp create mode 100644 csrc/gds/py_test/validate_gds.py mode change 100755 => 100644 csrc/xpu/includes/simd.h create mode 100755 deepspeed/ops/gds/__init__.py create mode 100644 op_builder/gds.py create mode 100644 tests/unit/ops/aio/test_gds.py diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h index 3cb3c5c3731f..a192804db13d 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.h +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -11,7 +11,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include #include "deepspeed_cpu_op.h" -#include "deepspeed_gds_op.h" struct thread_sync_t { std::mutex _mutex; diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index 6a1696598ed8..767ad5d905e0 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -17,7 +17,9 @@ cpu_op_desc_t::cpu_op_desc_t(const bool read_op, : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), _cpu_buffer(buffer) { - if (_buffer.is_cuda()) { + // Need to use CPU bounce buffer if buffer is not a page-locked DRAM memory. + _use_bounce_buffer = !(_buffer.is_cpu() && _buffer.is_pinned()); + if (_use_bounce_buffer) { if (_read_op) { auto options = torch::TensorOptions() .dtype(_buffer.dtype()) @@ -28,7 +30,6 @@ cpu_op_desc_t::cpu_op_desc_t(const bool read_op, _cpu_buffer = _buffer.to(torch::kCPU).pin_memory(); } } - _contiguous_buffer = _cpu_buffer.contiguous(); } @@ -36,7 +37,9 @@ char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_pt void cpu_op_desc_t::fini() { - if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + if (_read_op) { + if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + } } void cpu_op_desc_t::validate() diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h index d61fe4f3c545..07a4369674fc 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.h +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -9,6 +9,7 @@ struct cpu_op_desc_t : io_op_desc_t { torch::Tensor _cpu_buffer; + bool _use_bounce_buffer; cpu_op_desc_t(const bool read_op, const torch::Tensor& buffer, diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index f13cd6dd06db..a6a68ee1a1d0 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -12,56 +12,23 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. using namespace std; -bool deepspeed_aio_handle_t::s_cuFile_init = false; - static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } -static std::shared_ptr _create_io_op_desc(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const int num_threads, - const bool validate, - const bool use_gds) -{ - if (buffer.is_cuda() && use_gds) { - return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); - } else { - return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); - } -} - deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const bool use_gds, const int num_threads) : _aio_ctxt(new aio_context(block_size, queue_depth)), _single_submit(single_submit), _overlap_events(overlap_events), - _use_gds(use_gds), _num_threads(num_threads), _aio_config(block_size, queue_depth, single_submit, overlap_events, false), _num_pending_ops(0), _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) { - - if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) { - init_gds_cufile(block_size, queue_depth, num_threads); - deepspeed_aio_handle_t::s_cuFile_init = true; - } - if (use_gds) { - // GDS threads handled in cufile.json - _thread_contexts.push_back(std::make_shared(0, _aio_config)); - _num_threads = 1; - } else { // CPU OP - for (auto i = 0; i < num_threads; ++i) { - _thread_contexts.push_back(std::make_shared(i, _aio_config)); - } + for (auto i = 0; i < num_threads; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); } for (auto& ctxt : _thread_contexts) { @@ -73,7 +40,6 @@ deepspeed_aio_handle_t::~deepspeed_aio_handle_t() { _stop_threads(); for (auto& thr : _threads) { thr.join(); } - if (_use_gds) {close_gds();} } const int deepspeed_aio_handle_t::get_block_size() const @@ -90,8 +56,6 @@ const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_su const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } -const bool deepspeed_aio_handle_t::get_use_gds() const { return _use_gds; } - const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) @@ -239,6 +203,19 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, return true; } +std::shared_ptr deepspeed_aio_handle_t::_create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) +{ + return std::make_shared( + read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); +} + int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, const char* filename, const bool validate, @@ -263,8 +240,8 @@ int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, const auto fd = open_file(filename, true); if (fd == -1) { return -1; } - auto scheduled_op = _create_io_op_desc( - true, buffer, fd, filename, num_file_bytes, _num_threads, validate, _use_gds); + auto scheduled_op = + _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, _num_threads, validate); _schedule_aio_work(scheduled_op); @@ -278,7 +255,6 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, const bool validate, const bool async) { - const auto num_write_bytes = static_cast(buffer.nbytes()); assert((num_write_bytes % _num_threads) == 0); @@ -287,8 +263,8 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, const auto fd = open_file(filename, false); if (fd == -1) { return -1; } - auto scheduled_op = _create_io_op_desc( - false, buffer, fd, filename, num_write_bytes, _num_threads, validate, _use_gds); + auto scheduled_op = + _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, _num_threads, validate); _schedule_aio_work(scheduled_op); @@ -327,13 +303,3 @@ bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor { return _pinned_tensor_mgr->free(locked_tensor); } - -int deepspeed_aio_handle_t::new_device_locked_tensor(const torch::Tensor& buffer) -{ - return register_buffer(buffer); -} - -int deepspeed_aio_handle_t::free_device_locked_tensor(const torch::Tensor& buffer) -{ - return deregister_buffer(buffer); -} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index bc3f6818d402..180d9aba2f9c 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -16,10 +16,8 @@ struct deepspeed_aio_handle_t { std::unique_ptr _aio_ctxt; const bool _single_submit; const bool _overlap_events; - const bool _use_gds; int _num_threads; deepspeed_aio_config_t _aio_config; - static bool s_cuFile_init; std::vector> _thread_contexts; std::vector _threads; @@ -30,7 +28,6 @@ struct deepspeed_aio_handle_t { const int queue_depth, const bool single_submit, const bool overlap_events, - const bool use_gds, const int num_threads); ~deepspeed_aio_handle_t(); @@ -39,7 +36,6 @@ struct deepspeed_aio_handle_t { const int get_queue_depth() const; const bool get_single_submit() const; const bool get_overlap_events() const; - const bool get_use_gds() const; const int get_thread_count() const; int read(torch::Tensor& buffer, const char* filename, const bool validate); @@ -69,10 +65,6 @@ struct deepspeed_aio_handle_t { bool free_cpu_locked_tensor(torch::Tensor&); - int new_device_locked_tensor(const torch::Tensor& example_tensor); - - int free_device_locked_tensor(const torch::Tensor& example_tensor); - int wait(); void _stop_threads(); @@ -82,4 +74,13 @@ struct deepspeed_aio_handle_t { std::shared_ptr _wait_for_aio_work(); bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); + + virtual std::shared_ptr _create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate); }; diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index 561c46f7c287..f5480e9d9d83 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices. #include "deepspeed_py_copy.h" #include -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) or defined(__AVX256__) union AVX_Data { diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index 8e1736013d6e..78e91c1607b1 100755 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -21,20 +21,18 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); py::class_(m, "aio_handle") - .def(py::init(), + .def(py::init(), "AIO handle constructor", "block_size"_a, "queue_depth"_a, "single_submit"_a, "overlap_events"_a, - "use_gds"_a, "num_threads"_a) .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) - .def("get_use_gds", &deepspeed_aio_handle_t::get_use_gds) .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) .def("read", &deepspeed_aio_handle_t::read) @@ -50,8 +48,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor) .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor) - .def("new_device_locked_tensor", &deepspeed_aio_handle_t::new_device_locked_tensor) - .def("free_device_locked_tensor", &deepspeed_aio_handle_t::free_device_locked_tensor) .def("wait", &deepspeed_aio_handle_t::wait); } diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 9e55ae6cacf1..969afe39cee2 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -42,7 +42,7 @@ def pre_handle(args, tid, read_op): io_parallel = args.io_parallel if args.io_parallel else 1 handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, - not args.sequential_requests, gds,io_parallel) + not args.sequential_requests, gds, io_parallel) if gds: handle.new_device_locked_tensor(buffer) task_log(tid, f'created deepspeed aio handle') diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 14fa0027e004..7c2cb46f83fc 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -99,7 +99,7 @@ for xtype in cpu gpu gds; do OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - + echo ${DISABLE_CACHE} echo ${cmd} echo ${SYNC} diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h index 7f6a2430845a..dcbdc11c27ad 100644 --- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h @@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}; }; private: @@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}; }; private: diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp similarity index 54% rename from csrc/aio/py_lib/deepspeed_gds_op.cpp rename to csrc/gds/py_lib/deepspeed_gds_op.cpp index 207477ef455a..b8af24f09070 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -11,39 +11,11 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. using namespace std; -#ifdef __ENABLE_GDS__ -void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads) -{ - std::string depthStr = std::to_string(queue_depth); - std::string threadsStr = std::to_string(num_threads); - std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", "; - std::string json2 = R"("max_request_parallelism": )"+threadsStr+", "; - std::string json3 = R"("max_io_threads": )"+threadsStr+", "; - std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})"; - std::ofstream outFile("local_cufile.json"); - if (outFile.is_open()){ - outFile << json1 + json2 + json3 + json4; - outFile.close(); - } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);} - putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); - cuFileDriverOpen(); - cudaCheckError(); - size_t direct_io_size = (size_t)block_size / 1024; - CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size); - if (status.err != CU_FILE_SUCCESS) { - std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; - exit(EXIT_FAILURE); - } -}; - -// TODO: deregister and release all buffers -void close_gds() {cuFileDriverClose();} - // For when there is more than 1 device // static std::set base_buffer_registry; static std::map> base_ptr_registry; -void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle) +static void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle) { memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t)); cf_descr.handle.fd = fd; @@ -56,108 +28,44 @@ void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t } } -gds_op_desc_t::gds_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const int num_threads, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate) +static void* _find_base_ptr(const int64_t device, char* buf_ptr) { - // assert(_buffer.is_cuda()); - _contiguous_buffer = _buffer.contiguous(); - - const int64_t device = _buffer.get_device(); - - char * buf_ptr = (char *)_contiguous_buffer.data_ptr(); + void* base_ptr = nullptr; int64_t last = -1; int64_t ptr_diff; for (const auto& value : base_ptr_registry[device]) { - ptr_diff = buf_ptr - (char *)value; + ptr_diff = buf_ptr - (char*)value; if (last == -1 && ptr_diff >= 0) { last = ptr_diff; - _base_ptr = value; - } - else if ( ptr_diff < last && ptr_diff >= 0) { + base_ptr = value; + } else if (ptr_diff < last && ptr_diff >= 0) { last = ptr_diff; - _base_ptr = value; + base_ptr = value; } } - if (_contiguous_buffer.data_ptr() < _base_ptr) { - std::cerr << "BASE PTR ERROR :" << _base_ptr << " BUF PTR " << _contiguous_buffer.data_ptr() << std::endl; + if (!base_ptr || buf_ptr < base_ptr) { + std::cerr << "BASE PTR ERROR :" << base_ptr << " BUF PTR " << (void*)buf_ptr << std::endl; for (const auto& value : base_ptr_registry[device]) { - std::cerr << "BASE PTR AVAIL :" << value << std::endl; + std::cerr << "BASE PTR AVAIL :" << value << std::endl; } exit(EXIT_FAILURE); } - check_cudaruntimecall(cudaSetDevice(device)); - - _safe_handle_register(fd, _cf_descr, _cf_handle); - -} - -char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } - -void gds_op_desc_t::fini() -{ - //check_cuFileCall(cuFileBufDeregister(_buffer.data_ptr()), "file buffer deregister"); - cuFileHandleDeregister(_cf_handle); -} - -void gds_op_desc_t::validate() -{ - - check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); - const auto cpu_buffer = _buffer.to(torch::kCPU); - validate_aio_operation( - _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes); -} - -void gds_op_desc_t::run(const int tid, - std::unique_ptr& aio_ctxt, - deepspeed_aio_config_t* aio_config) -{ - assert(tid < _num_threads); - check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); - int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char *)_base_ptr; - const auto file_offset = _num_bytes_per_thread * tid; - - if (_read_op) { - auto ret = cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); - if (ret < 0) { _report_error(ret, errno, buf_offset); } - } else { - auto ret = cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); - if (ret < 0) { _report_error(ret, errno, buf_offset); } - } -} -void gds_op_desc_t::_report_error(const ssize_t return_code, - const int error_num, - const off_t offset) -{ - const auto op_string = _read_op ? "read failed with " : "write failed with "; - const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: "; - const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code) - : cuFileGetErrorString(error_num); - std::cerr << op_string << error_string << error_code << " return code = " << return_code - << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread - << " offset = " << offset << std::endl; - exit(EXIT_FAILURE); + return base_ptr; } -int register_buffer(const torch::Tensor& buffer) +void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer) { const int64_t device = buffer.get_device(); - void * reg_ptr = buffer.data_ptr(); + void* reg_ptr = buffer.data_ptr(); // std::cout << "REG PTR " << reg_ptr << std::endl; // TODO: add checking to make sure pointer isn't already in set const auto it = base_ptr_registry.find(device); if (it == base_ptr_registry.end()) { - std::set new_ptr_set; + std::set new_ptr_set; new_ptr_set.insert(reg_ptr); - base_ptr_registry.insert(std::pair>(device, new_ptr_set)); + base_ptr_registry.insert(std::pair>(device, new_ptr_set)); } else { base_ptr_registry[device].insert(reg_ptr); } @@ -168,13 +76,12 @@ int register_buffer(const torch::Tensor& buffer) std::cerr << "buffer register failed:" << cuFileGetErrorString(status) << std::endl; exit(EXIT_FAILURE); } - return 0; } -int deregister_buffer(const torch::Tensor& buffer) +void gds_op_desc_t::remove_buffer_from_registry(const torch::Tensor& buffer) { const int64_t device = buffer.get_device(); - void * reg_ptr = buffer.data_ptr(); + void* reg_ptr = buffer.data_ptr(); // std::cout << "DEREG PTR " << reg_ptr << std::endl; check_cudaruntimecall(cudaSetDevice(device)); @@ -182,19 +89,8 @@ int deregister_buffer(const torch::Tensor& buffer) // Remove from tracked registry base_ptr_registry[device].erase(reg_ptr); - return 0; } -#else -void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads) -{ - std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; - exit(EXIT_FAILURE); -}; -void close_gds() -{ - std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; - exit(EXIT_FAILURE); -}; + gds_op_desc_t::gds_op_desc_t(const bool read_op, const torch::Tensor& buffer, const int fd, @@ -204,17 +100,56 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const bool validate) : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate) { - std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; - exit(EXIT_FAILURE); -}; -int register_buffer(const torch::Tensor& buffer) + _contiguous_buffer = _buffer.contiguous(); + const int64_t device = _buffer.get_device(); + check_cudaruntimecall(cudaSetDevice(device)); + _base_ptr = _find_base_ptr(device, (char*)_contiguous_buffer.data_ptr()); + + _safe_handle_register(fd, _cf_descr, _cf_handle); +} + +char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void gds_op_desc_t::fini() { cuFileHandleDeregister(_cf_handle); } + +void gds_op_desc_t::validate() { - std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; - exit(EXIT_FAILURE); -}; -int deregister_buffer(const torch::Tensor& buffer) + check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); + const auto cpu_buffer = _buffer.to(torch::kCPU); + validate_aio_operation( + _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes); +} + +void gds_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) { - std::cerr << "Library compiled without __ENABLE_GDS__" << std::endl; + assert(tid < _num_threads); + check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); + int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; + const auto file_offset = _num_bytes_per_thread * tid; + + if (_read_op) { + auto ret = + cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, buf_offset); } + } else { + auto ret = + cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, buf_offset); } + } +} + +void gds_op_desc_t::_report_error(const ssize_t return_code, + const int error_num, + const off_t offset) +{ + const auto op_string = _read_op ? "read failed with " : "write failed with "; + const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: "; + const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code) + : cuFileGetErrorString(error_num); + std::cerr << op_string << error_string << error_code << " return code = " << return_code + << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread + << " offset = " << offset << std::endl; exit(EXIT_FAILURE); -}; -#endif +} diff --git a/csrc/aio/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h similarity index 68% rename from csrc/aio/py_lib/deepspeed_gds_op.h rename to csrc/gds/py_lib/deepspeed_gds_op.h index 1e955aa67558..3ad8b9ecf58d 100644 --- a/csrc/aio/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -3,23 +3,19 @@ // DeepSpeed Team +#include +#include #include #include -#include -#include -#include #include +#include #include "deepspeed_aio_op_desc.h" -#ifdef __ENABLE_GDS__ #include "deepspeed_gds_utils.h" -#endif struct gds_op_desc_t : io_op_desc_t { - #ifdef __ENABLE_GDS__ - CUfileDescr_t _cf_descr; - CUfileHandle_t _cf_handle; - #endif + CUfileDescr_t _cf_descr; + CUfileHandle_t _cf_handle; void* _base_ptr; gds_op_desc_t(const bool read_op, @@ -40,17 +36,9 @@ struct gds_op_desc_t : io_op_desc_t { void fini(); - void _read_file(const int tid); - - void _write_file(const int tid); - void _report_error(const ssize_t return_code, const int error_num, const off_t offset); -}; -int register_buffer(const torch::Tensor& buffer); + static void add_buffer_to_registry(const torch::Tensor& buffer); -int deregister_buffer(const torch::Tensor& buffer); - -void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads); - -void close_gds(); + static void remove_buffer_from_registry(const torch::Tensor& buffer); +}; diff --git a/csrc/aio/py_lib/deepspeed_gds_utils.h b/csrc/gds/py_lib/deepspeed_gds_utils.h similarity index 100% rename from csrc/aio/py_lib/deepspeed_gds_utils.h rename to csrc/gds/py_lib/deepspeed_gds_utils.h diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp new file mode 100644 index 000000000000..859ca19535a4 --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* + GPUDirect Storage functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_gds_handle.h" +#include +#include "deepspeed_gds_op.h" + +using namespace std; + +int deepspeed_gds_handle_t::s_cuFile_init = 0; + +deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads) + : deepspeed_aio_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads) +{ + _init_cuFile(block_size, queue_depth, num_threads); +} + +deepspeed_gds_handle_t::~deepspeed_gds_handle_t() { _close_cuFile(); } + +void deepspeed_gds_handle_t::_init_cuFile(const int block_size, + const int queue_depth, + const int num_threads) +{ + if (deepspeed_gds_handle_t::s_cuFile_init == 0) { + std::string depthStr = std::to_string(queue_depth); + std::string threadsStr = std::to_string(num_threads); + std::string json1 = R"({"execution": {"max_io_queue_depth": )" + depthStr + ", "; + std::string json2 = R"("max_request_parallelism": )" + threadsStr + ", "; + std::string json3 = R"("max_io_threads": )" + threadsStr + ", "; + std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})"; + std::ofstream outFile("local_cufile.json"); + if (outFile.is_open()) { + outFile << json1 + json2 + json3 + json4; + outFile.close(); + } else { + std::cerr << "Can't open local cufile" << std::endl; + exit(EXIT_FAILURE); + } + putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); + cuFileDriverOpen(); + cudaCheckError(); + size_t direct_io_size = (size_t)block_size / 1024; + CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; + exit(EXIT_FAILURE); + } + } + deepspeed_gds_handle_t::s_cuFile_init++; +} + +void deepspeed_gds_handle_t::_close_cuFile() +{ + deepspeed_gds_handle_t::s_cuFile_init--; + if (deepspeed_gds_handle_t::s_cuFile_init == 0) { cuFileDriverClose(); } +} + +int deepspeed_gds_handle_t::new_device_locked_tensor(const torch::Tensor& buffer) +{ + gds_op_desc_t::add_buffer_to_registry(buffer); + return 0; +} + +int deepspeed_gds_handle_t::free_device_locked_tensor(const torch::Tensor& buffer) +{ + gds_op_desc_t::remove_buffer_from_registry(buffer); + return 0; +} + +std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) +{ + if (buffer.is_cuda()) { + return std::make_shared( + read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); + } + return deepspeed_aio_handle_t::_create_io_op_desc( + read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); +} diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h new file mode 100644 index 000000000000..0e42b07a49dc --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include "deepspeed_py_aio_handle.h" + +struct deepspeed_gds_handle_t : deepspeed_aio_handle_t { + deepspeed_gds_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads); + + ~deepspeed_gds_handle_t(); + + int new_device_locked_tensor(const torch::Tensor& buffer); + + int free_device_locked_tensor(const torch::Tensor& buffer); + + void _init_cuFile(const int block_size, const int queue_length, const int num_threads); + + void _close_cuFile(); + + std::shared_ptr _create_io_op_desc(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate); + + static int s_cuFile_init; +}; diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp new file mode 100644 index 000000000000..7d1c34e3bcad --- /dev/null +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include "deepspeed_py_gds_handle.h" +using namespace pybind11::literals; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + py::class_(m, "gds_handle") + .def(py::init(), + "GDS handle constructor", + "block_size"_a, + "queue_depth"_a, + "single_submit"_a, + "overlap_events"_a, + "num_threads"_a) + + .def("get_block_size", &deepspeed_gds_handle_t::get_block_size) + .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth) + .def("get_single_submit", &deepspeed_gds_handle_t::get_single_submit) + .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events) + .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count) + + .def("read", &deepspeed_gds_handle_t::read) + .def("write", &deepspeed_gds_handle_t::write) + + .def("pread", &deepspeed_gds_handle_t::pread) + .def("pwrite", &deepspeed_gds_handle_t::pwrite) + + .def("sync_pread", &deepspeed_gds_handle_t::sync_pread) + .def("sync_pwrite", &deepspeed_gds_handle_t::sync_pwrite) + .def("async_pread", &deepspeed_gds_handle_t::async_pread) + .def("async_pwrite", &deepspeed_gds_handle_t::async_pwrite) + + .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor) + .def("free_cpu_locked_tensor", &deepspeed_gds_handle_t::free_cpu_locked_tensor) + .def("new_device_locked_tensor", &deepspeed_gds_handle_t::new_device_locked_tensor) + .def("free_device_locked_tensor", &deepspeed_gds_handle_t::free_device_locked_tensor) + + .def("wait", &deepspeed_gds_handle_t::wait); +} diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py new file mode 100644 index 000000000000..b34b1194f582 --- /dev/null +++ b/csrc/gds/py_test/validate_gds.py @@ -0,0 +1,10 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +from deepspeed.ops.op_builder import GDSBuilder +assert GDSBuilder().is_compatible(True) +assert GDSBuilder().load(True) diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index f5bfb45dd2e2..a205026ec7c1 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val) std::memcpy(dst, &val, sizeof(T)); } -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h old mode 100755 new mode 100644 index f77568be7835..097e2d8585cc --- a/csrc/xpu/includes/simd.h +++ b/csrc/xpu/includes/simd.h @@ -13,7 +13,7 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h index fa41757c895b..1897afd1fea2 100644 --- a/csrc/xpu/includes/type_shim.h +++ b/csrc/xpu/includes/type_shim.h @@ -82,11 +82,11 @@ } template -__inline__ __attribute__((always_inline)) T reduce_block_into_lanes( - T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. +__inline__ __attribute__((always_inline)) T +reduce_block_into_lanes(T* x, + T val, + int lanes = 1, + bool share_result = false) // lanes is intended to be <= 32. { auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index 53b0c8072923..039b999dfeca 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -28,6 +28,7 @@ log = get_logger(__name__) + class DSElasticAgent(LocalElasticAgent): def __init__( diff --git a/deepspeed/ops/gds/__init__.py b/deepspeed/ops/gds/__init__.py new file mode 100755 index 000000000000..3c0762c81076 --- /dev/null +++ b/deepspeed/ops/gds/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..op_builder import GDSBuilder diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py index df4a38380089..46c3f2a0c954 100644 --- a/deepspeed/runtime/swap_tensor/aio_config.py +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -5,25 +5,33 @@ from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.accelerator import get_accelerator AIO_DEFAULT_DICT = { AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT, AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT, AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT, AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT, - AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT + AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT, + AIO_USE_GDS: AIO_USE_GDS_DEFAULT } def get_aio_config(param_dict): if AIO in param_dict.keys() and param_dict[AIO] is not None: aio_dict = param_dict[AIO] - return { + aio_config = { AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT), AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), - AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT) + AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT), + AIO_USE_GDS: get_scalar_param(aio_dict, AIO_USE_GDS, AIO_USE_GDS_DEFAULT) } + if aio_config[AIO_USE_GDS]: + assert get_accelerator().device_name() == 'cuda', 'GDS currently only supported for CUDA accelerator' + + return aio_config + return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py index 4c9722bc4e4f..cee20ac7b78c 100644 --- a/deepspeed/runtime/swap_tensor/constants.py +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -11,7 +11,8 @@ "queue_depth": 8, "thread_count": 1, "single_submit": false, - "overlap_events": true + "overlap_events": true, + "use_gds": false } ''' AIO = "aio" @@ -25,3 +26,5 @@ AIO_SINGLE_SUBMIT_DEFAULT = False AIO_OVERLAP_EVENTS = "overlap_events" AIO_OVERLAP_EVENTS_DEFAULT = True +AIO_USE_GDS = "use_gds" +AIO_USE_GDS_DEFAULT = False diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index 6f09a687d98c..120723fae5ab 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -13,6 +13,7 @@ from deepspeed import comm as dist from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder from .constants import * from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, AIO_ALIGNED_BYTES, print_object, SwapBufferPool @@ -37,9 +38,6 @@ class AsyncPartitionedParameterSwapper(object): def __init__(self, ds_config, model_dtype): - aio_op = AsyncIOBuilder().load(verbose=False) - self.aio_handle = aio_op.aio_handle - self.use_gds = True self.dtype = model_dtype #set swap buffers, create aio handles @@ -94,6 +92,10 @@ def _configure_aio(self, ds_config): self.aio_config = ds_config.aio_config + self.use_gds = self.aio_config[AIO_USE_GDS] + self.aio_handle = GDSBuilder().load(verbose=False).gds_handle if self.use_gds else AsyncIOBuilder().load( + verbose=False).aio_handle + # Read/Write alignment for each thread during Intra-request parallelism self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE]) self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_THREAD_COUNT] @@ -107,18 +109,17 @@ def _configure_aio(self, ds_config): self.reserved_buffer_ids = [] self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT]) + self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT]) + self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) if self.use_gds: - self.buffers = torch.empty(int(self.aligned_elements_per_buffer * - self.param_buffer_count), + self.buffers = torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count), dtype=self.dtype, - device='cuda', # gotta be cuda + device=get_accelerator().device_name(), requires_grad=False) self.aio_read_handle.new_device_locked_tensor(self.buffers) else: diff --git a/op_builder/async_io.py b/op_builder/async_io.py index a8620387d209..a9039def3a40 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -20,15 +20,18 @@ def __init__(self): def absolute_name(self): return f'deepspeed.ops.aio.{self.NAME}_op' - def sources(self): - return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', + def lib_sources(self): + src_list = [ 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', - 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_gds_op.cpp', - 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] + return src_list + + def sources(self): + return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp'] def include_paths(self): import torch @@ -47,21 +50,9 @@ def include_paths(self): def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O args = super().cxx_args() - GDS_ENABLE = self.is_gds_enable() - args += [ - '-Wall', - '-O0', - '-shared', - '-fPIC', - '-Wno-reorder', - GDS_ENABLE - ] - + args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder'] return args - def is_gds_enable(self): - return '-D__ENABLE_GDS__' - def extra_ldflags(self): if self.build_for_cpu: return ['-fopenmp'] @@ -69,7 +60,8 @@ def extra_ldflags(self): import torch.utils.cpp_extension CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64") - return [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart', '-lcufile'] + ldflags = [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart'] + return ldflags def check_for_libaio_pkg(self): libs = dict( diff --git a/op_builder/builder.py b/op_builder/builder.py index 03611bf56284..cc87e962c853 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -284,7 +284,7 @@ def is_compatible(self, verbose=True): def extra_ldflags(self): return [] - def has_function(self, funcname, libraries, verbose=False): + def has_function(self, funcname, libraries, library_dirs=None, verbose=False): ''' Test for existence of a function within a tuple of libraries. @@ -340,7 +340,8 @@ def has_function(self, funcname, libraries, verbose=False): compiler.link_executable(objs, os.path.join(tempdir, 'a.out'), extra_preargs=self.strip_empty_entries(ldflags), - libraries=libraries) + libraries=libraries, + library_dirs=library_dirs) # Compile and link succeeded return True diff --git a/op_builder/gds.py b/op_builder/gds.py new file mode 100644 index 000000000000..3b06ca16f40d --- /dev/null +++ b/op_builder/gds.py @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +from .async_io import AsyncIOBuilder + + +class GDSBuilder(AsyncIOBuilder): + BUILD_VAR = "DS_BUILD_GDS" + NAME = "gds" + + def __init__(self): + super().__init__() + + def absolute_name(self): + return f'deepspeed.ops.gds.{self.NAME}_op' + + def lib_sources(self): + src_list = ['csrc/gds/py_lib/deepspeed_py_gds_handle.cpp', 'csrc/gds/py_lib/deepspeed_gds_op.cpp'] + return super().lib_sources() + src_list + + def sources(self): + return self.lib_sources() + ['csrc/gds/py_lib/py_ds_gds.cpp'] + + def cxx_args(self): + return super().cxx_args() + ['-lcufile'] + + def include_paths(self): + import torch + CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] + return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE + + def extra_ldflags(self): + return super().extra_ldflags() + ['-lcufile'] + + def is_compatible(self, verbose=True): + import torch.utils.cpp_extension + CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME + CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64") + gds_compatible = self.has_function(funcname="cuFileDriverOpen", + libraries=("cufile", ), + library_dirs=( + CUDA_HOME, + CUDA_LIB64, + ), + verbose=verbose) + + return gds_compatible and super().is_compatible(verbose) diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index eb6ddd4da8cb..e6927efc3824 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -13,26 +13,22 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from unit.common import DistributedTest -KILO_BYTE = 1024*256 +KILO_BYTE = 1024 BLOCK_SIZE = KILO_BYTE QUEUE_DEPTH = 2 IO_SIZE = 4 * BLOCK_SIZE IO_PARALLEL = 2 -GDS_ENABLE=True if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True) -def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True, use_gds=False): +def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): if not get_accelerator().is_available(): if use_cuda_device: pytest.skip("GPU tensors only supported in CUDA environments.") if use_cuda_pinned_tensor: pytest.skip("CUDA-pinned tensors only supported in CUDA environments.") - if not GDS_ENABLE and use_gds: - pytest.skip("GDS not available, won't run GDS case.") - def _get_local_rank(): @@ -62,6 +58,7 @@ def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): return test_file, test_buffer +def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) @@ -73,19 +70,17 @@ def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): return test_file, test_buffer -def _validate_handle_state(handle, single_submit, overlap_events, use_gds): +def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - if use_gds: - assert handle.get_thread_count() == 1 - else: - assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_thread_count() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH -@pytest.mark.parametrize("single_submit", [True,False]) -@pytest.mark.parametrize("overlap_events", [True,False]) -@pytest.mark.parametrize("use_cuda_pinned_tensor, use_gds", [(False,False),(True,False),(False,True)]) + +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) class TestRead(DistributedTest): world_size = 1 reuse_dist_env = True @@ -94,20 +89,17 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, tmpdir, single_submit, overlap_events, use_cuda_pinned_tensor, use_gds): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) + def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) - elif use_gds: - aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda') - h.new_device_locked_tensor(aio_buffer) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) - _validate_handle_state(h, single_submit, overlap_events, use_gds) + _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) read_status = h.sync_pread(aio_buffer, ref_file) @@ -117,17 +109,15 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events, use_cuda_pin ref_buffer = list(f.read()) assert ref_buffer == aio_buffer.tolist() - if use_gds: - h.free_device_locked_tensor(aio_buffer) - elif not use_cuda_pinned_tensor: + if not use_cuda_pinned_tensor: h.free_cpu_locked_tensor(aio_buffer) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) use_cpu_locked_tensor = False - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if cuda_device: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) @@ -154,10 +144,9 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap h.free_cpu_locked_tensor(aio_buffer) -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken @pytest.mark.parametrize("single_submit", [True, False]) @pytest.mark.parametrize("overlap_events", [True, False]) -@pytest.mark.parametrize("use_gds", [False]) class TestWrite(DistributedTest): world_size = 1 reuse_dist_env = True @@ -166,11 +155,11 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if use_cuda_pinned_tensor: aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) @@ -191,12 +180,12 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove assert filecmp.cmp(ref_file, aio_file, shallow=False) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) @@ -224,9 +213,8 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla @pytest.mark.sequential -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken @pytest.mark.parametrize("cuda_device", [True, False]) -@pytest.mark.parametrize("use_gds", [False]) class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False @@ -235,8 +223,8 @@ class TestAsyncQueue(DistributedTest): set_dist_env = False @pytest.mark.parametrize("async_queue", [2, 3]) - def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, use_gds): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) + def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] for i in range(async_queue): @@ -245,7 +233,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, us single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: @@ -282,8 +270,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, us h.free_cpu_locked_tensor(t) @pytest.mark.parametrize("async_queue", [2, 3]) - def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device, use_gds): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds) + def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] ref_buffers = [] @@ -294,7 +282,7 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device, u single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, useg_gds,IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) aio_files = [] aio_buffers = [] diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py new file mode 100644 index 000000000000..7afa5970d69f --- /dev/null +++ b/tests/unit/ops/aio/test_gds.py @@ -0,0 +1,244 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import os +import filecmp +import torch +import deepspeed +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import GDSBuilder +from unit.common import DistributedTest + +KILO_BYTE = 1024 * 256 +BLOCK_SIZE = KILO_BYTE +QUEUE_DEPTH = 2 +IO_SIZE = 4 * BLOCK_SIZE +IO_PARALLEL = 2 + +if not deepspeed.ops.__compatible_ops__[GDSBuilder.NAME]: + pytest.skip('Skip tests since gds is not compatible', allow_module_level=True) + + +def _get_local_rank(): + if get_accelerator().is_available(): + return dist.get_rank() + return 0 + + +def _do_ref_write(tmpdir, index=0): + file_suffix = f'{_get_local_rank()}_{index}' + ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt') + ref_buffer = os.urandom(IO_SIZE) + with open(ref_file, 'wb') as f: + f.write(ref_buffer) + + return ref_file, ref_buffer + + +def _get_test_write_file(tmpdir, index): + file_suffix = f'{_get_local_rank()}_{index}' + return os.path.join(tmpdir, f'_gds_write_random_{file_suffix}.pt') + + +def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index=0): + test_file = _get_test_write_file(tmpdir, index) + test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) + gds_handle.new_device_locked_tensor(test_buffer) + return test_file, test_buffer + + +def _validate_handle_state(handle, single_submit, overlap_events): + assert handle.get_single_submit() == single_submit + assert handle.get_overlap_events() == overlap_events + assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_block_size() == BLOCK_SIZE + assert handle.get_queue_depth() == QUEUE_DEPTH + + +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestRead(DistributedTest): + world_size = 1 + reuse_dist_env = True + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_parallel_read(self, tmpdir, single_submit, overlap_events): + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + h.new_device_locked_tensor(gds_buffer) + + _validate_handle_state(h, single_submit, overlap_events) + + ref_file, _ = _do_ref_write(tmpdir) + read_status = h.sync_pread(gds_buffer, ref_file) + assert read_status == 1 + + with open(ref_file, 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == gds_buffer.tolist() + + h.free_device_locked_tensor(gds_buffer) + + def test_async_read(self, tmpdir, single_submit, overlap_events): + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + h.new_device_locked_tensor(gds_buffer) + + _validate_handle_state(h, single_submit, overlap_events) + + ref_file, _ = _do_ref_write(tmpdir) + read_status = h.async_pread(gds_buffer, ref_file) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == 1 + + with open(ref_file, 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == gds_buffer.tolist() + + h.free_device_locked_tensor(gds_buffer) + + +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestWrite(DistributedTest): + world_size = 1 + reuse_dist_env = True + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_parallel_write(self, tmpdir, single_submit, overlap_events): + + ref_file, ref_buffer = _do_ref_write(tmpdir) + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h) + + _validate_handle_state(h, single_submit, overlap_events) + + write_status = h.sync_pwrite(gds_buffer, gds_file) + assert write_status == 1 + + h.free_device_locked_tensor(gds_buffer) + + assert os.path.isfile(gds_file) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, gds_file, shallow=False) + + def test_async_write(self, tmpdir, single_submit, overlap_events): + ref_file, ref_buffer = _do_ref_write(tmpdir) + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h) + + _validate_handle_state(h, single_submit, overlap_events) + + write_status = h.async_pwrite(gds_buffer, gds_file) + assert write_status == 0 + + wait_status = h.wait() + assert wait_status == 1 + + h.free_device_locked_tensor(gds_buffer) + + assert os.path.isfile(gds_file) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, gds_file, shallow=False) + + +@pytest.mark.sequential +class TestAsyncQueue(DistributedTest): + world_size = 1 + requires_cuda_env = False + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + @pytest.mark.parametrize("async_queue", [2, 3]) + def test_read(self, tmpdir, async_queue): + + ref_files = [] + for i in range(async_queue): + f, _ = _do_ref_write(tmpdir, i) + ref_files.append(f) + + single_submit = True + overlap_events = True + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_buffers = [ + torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) + ] + for buf in gds_buffers: + h.new_device_locked_tensor(buf) + + _validate_handle_state(h, single_submit, overlap_events) + + for i in range(async_queue): + read_status = h.async_pread(gds_buffers[i], ref_files[i]) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == async_queue + + for i in range(async_queue): + with open(ref_files[i], 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == gds_buffers[i].tolist() + + for t in gds_buffers: + h.free_device_locked_tensor(t) + + @pytest.mark.parametrize("async_queue", [2, 3]) + def test_write(self, tmpdir, async_queue): + ref_files = [] + ref_buffers = [] + for i in range(async_queue): + f, buf = _do_ref_write(tmpdir, i) + ref_files.append(f) + ref_buffers.append(buf) + + single_submit = True + overlap_events = True + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_files = [] + gds_buffers = [] + for i in range(async_queue): + f, buf = _get_test_write_file_and_device_buffer(tmpdir, ref_buffers[i], h, i) + gds_files.append(f) + gds_buffers.append(buf) + + _validate_handle_state(h, single_submit, overlap_events) + + for i in range(async_queue): + read_status = h.async_pwrite(gds_buffers[i], gds_files[i]) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == async_queue + + for t in gds_buffers: + h.free_device_locked_tensor(t) + + for i in range(async_queue): + assert os.path.isfile(gds_files[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_files[i], gds_files[i], shallow=False) From 90baebac62e382c7ce855d2ab80471445559b5c9 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 6 Aug 2024 15:04:34 -0700 Subject: [PATCH 14/31] Formatting fix --- .../predicated_tile_access_iterator_residual_last.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h index dcbdc11c27ad..7f6a2430845a 100644 --- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h @@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}; + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; }; private: @@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}; + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; }; private: From bc4c5998ed860842ebe33991c62aec84c2f406f9 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 6 Aug 2024 15:21:53 -0700 Subject: [PATCH 15/31] Disable build GDS in pre compile ops --- .github/workflows/nv-pre-compile-ops.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index a506bb27fda4..72ba8abbd95d 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -36,7 +36,7 @@ jobs: #python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Compile DeepSpeed Ops run: | - DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . + DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . - name: DS Report run: | ds_report From 759d9f8a2aef06ec64fd1a2d7a64d2b8cc1aab32 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 7 Aug 2024 22:45:48 +0000 Subject: [PATCH 16/31] updating microbenchmark script --- csrc/aio/py_test/ds_aio_handle.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 969afe39cee2..a9c5a9d207d7 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -11,6 +11,7 @@ import time from multiprocessing import Pool, Barrier from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file from deepspeed.accelerator import get_accelerator @@ -41,10 +42,11 @@ def pre_handle(args, tid, read_op): force=True) io_parallel = args.io_parallel if args.io_parallel else 1 - handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, - not args.sequential_requests, gds, io_parallel) if gds: + handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel) handle.new_device_locked_tensor(buffer) + else: + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel) task_log(tid, f'created deepspeed aio handle') ctxt = {} From d56ab675c216dc7f7c66621bf9047c87c8b1af29 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 9 Aug 2024 16:26:22 -0400 Subject: [PATCH 17/31] Fix formatting --- csrc/aio/common/deepspeed_aio_common.cpp | 5 +++-- csrc/aio/py_lib/deepspeed_py_aio.cpp | 10 ++++++---- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 10 ++++++---- csrc/aio/py_lib/deepspeed_py_copy.cpp | 2 +- csrc/aio/py_test/ds_aio_handle.py | 6 ++++-- .../evoformer_attn/gemm_kernel_utils.h | 9 +++++---- csrc/includes/simd.h | 2 +- csrc/xpu/includes/simd.h | 2 +- csrc/xpu/includes/type_shim.h | 10 +++++----- 9 files changed, 32 insertions(+), 24 deletions(-) diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index a65cc500cc82..0f2895dfa328 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -301,8 +301,9 @@ int regular_read(const char* filename, std::vector& buffer) } while (r > 0); if (read_bytes != num_bytes) { - std::cerr << "read error " << " read_bytes (read) = " << read_bytes - << " num_bytes (fstat) = " << num_bytes << std::endl; + std::cerr << "read error " + << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes + << std::endl; } assert(read_bytes == num_bytes); close(fd); diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index eac268d33433..30b6682ada72 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -69,8 +69,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } @@ -114,7 +115,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 284c84c721f5..a6a68ee1a1d0 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -91,8 +91,9 @@ int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, co if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } @@ -125,8 +126,9 @@ int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index f5480e9d9d83..561c46f7c287 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices. #include "deepspeed_py_copy.h" #include -#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) #if defined(__AVX512__) or defined(__AVX256__) union AVX_Data { diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index a9c5a9d207d7..881cbb7f2b8a 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -43,10 +43,12 @@ def pre_handle(args, tid, read_op): io_parallel = args.io_parallel if args.io_parallel else 1 if gds: - handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel) + handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) handle.new_device_locked_tensor(buffer) else: - handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel) + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) task_log(tid, f'created deepspeed aio handle') ctxt = {} diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h index c102234a4dfb..2a4300c5cac1 100644 --- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h +++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h @@ -125,10 +125,11 @@ struct CheckArch { std::cerr << #PTR " is not correctly aligned\n"; \ return false; \ } -#define EVOFORMER_CHECK(COND, ERR) \ - if (!(COND)) { \ - std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \ - return false; \ +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "[Evoformer Attention]" \ + << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ } #endif diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index a205026ec7c1..f5bfb45dd2e2 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val) std::memcpy(dst, &val, sizeof(T)); } -#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h index 097e2d8585cc..f77568be7835 100644 --- a/csrc/xpu/includes/simd.h +++ b/csrc/xpu/includes/simd.h @@ -13,7 +13,7 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) -#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h index 1897afd1fea2..fa41757c895b 100644 --- a/csrc/xpu/includes/type_shim.h +++ b/csrc/xpu/includes/type_shim.h @@ -82,11 +82,11 @@ } template -__inline__ __attribute__((always_inline)) T -reduce_block_into_lanes(T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. +__inline__ __attribute__((always_inline)) T reduce_block_into_lanes( + T* x, + T val, + int lanes = 1, + bool share_result = false) // lanes is intended to be <= 32. { auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); From 1e2082581cbc160489b756b00af5418e39b2a57c Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 9 Aug 2024 19:14:25 -0400 Subject: [PATCH 18/31] Formatting and Typo fixes --- csrc/aio/py_lib/deepspeed_aio_op_desc.cpp | 2 +- csrc/aio/py_lib/deepspeed_aio_op_desc.h | 2 +- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 2 +- csrc/aio/py_lib/deepspeed_cpu_op.h | 2 +- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 4 ++-- csrc/aio/py_lib/deepspeed_py_aio_handle.h | 2 +- csrc/aio/py_test/ds_aio_basic.py | 4 ++-- csrc/aio/py_test/ds_aio_handle.py | 4 ++-- csrc/gds/py_lib/deepspeed_gds_op.cpp | 3 +-- csrc/gds/py_lib/deepspeed_gds_op.h | 2 +- 10 files changed, 13 insertions(+), 14 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp index 5c9bb033c0c2..dc820be528d0 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -27,7 +27,7 @@ io_op_desc_t::io_op_desc_t(const bool read_op, char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } -void io_op_desc_t::fini() {} +void io_op_desc_t::finish() {} void io_op_desc_t::validate() {} diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h index c5bffae10265..7305f6920c91 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -36,6 +36,6 @@ struct io_op_desc_t { virtual void validate(); - virtual void fini(); + virtual void finish(); }; #endif // _IO_OP_DESC_T_ diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index 767ad5d905e0..b3b1932bd70b 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -35,7 +35,7 @@ cpu_op_desc_t::cpu_op_desc_t(const bool read_op, char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } -void cpu_op_desc_t::fini() +void cpu_op_desc_t::finish() { if (_read_op) { if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h index 07a4369674fc..da96dd2b1d50 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.h +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -27,5 +27,5 @@ struct cpu_op_desc_t : io_op_desc_t { void validate(); - void fini(); + void finish(); }; diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index a6a68ee1a1d0..8f2c82f86968 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -179,7 +179,7 @@ int deepspeed_aio_handle_t::wait() if (completed_op->_validate) { completed_op->validate(); } - completed_op->fini(); + completed_op->finish(); close(completed_op->_fd); @@ -195,7 +195,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, { const auto op_string = read_op ? "Read" : "Write"; if (num_bytes % get_thread_count()) { - std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes << " not divisible by thread count = " << get_thread_count() << std::endl; return false; } diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index 180d9aba2f9c..c89998974743 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -16,7 +16,7 @@ struct deepspeed_aio_handle_t { std::unique_ptr _aio_ctxt; const bool _single_submit; const bool _overlap_events; - int _num_threads; + const int _num_threads; deepspeed_aio_config_t _aio_config; std::vector> _thread_contexts; diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index e90886e17871..9b3c7cbfc49f 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -119,7 +119,7 @@ def _aio_handle_tasklet(pool_params): return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops -def _init_takslet(b): +def _init_tasklet(b): global aio_barrier aio_barrier = b @@ -128,7 +128,7 @@ def aio_basic_multiprocessing(args, read_op): num_processes = len(args.mapping_dict) b = Barrier(num_processes) pool_params = [(args, p, read_op) for p in range(num_processes)] - with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p: + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 881cbb7f2b8a..9480b20398d6 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -207,7 +207,7 @@ def _aio_handle_tasklet(pool_params): return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops -def _init_takslet(b): +def _init_tasklet(b): global aio_barrier aio_barrier = b @@ -216,7 +216,7 @@ def aio_handle_multiprocessing(args, read_op): num_processes = len(args.mapping_dict) b = Barrier(num_processes) pool_params = [(args, p, read_op) for p in range(num_processes)] - with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p: + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index b8af24f09070..c370a448e5a2 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -12,7 +12,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. using namespace std; // For when there is more than 1 device -// static std::set base_buffer_registry; static std::map> base_ptr_registry; static void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle) @@ -110,7 +109,7 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } -void gds_op_desc_t::fini() { cuFileHandleDeregister(_cf_handle); } +void gds_op_desc_t::finish() { cuFileHandleDeregister(_cf_handle); } void gds_op_desc_t::validate() { diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index 3ad8b9ecf58d..b7fab64d4054 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -34,7 +34,7 @@ struct gds_op_desc_t : io_op_desc_t { void validate(); - void fini(); + void finish(); void _report_error(const ssize_t return_code, const int error_num, const off_t offset); From f6c6f66ea0d9e939552e96cd1169846eefc20795 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 11 Aug 2024 20:15:39 +0000 Subject: [PATCH 19/31] Fix GDS issues --- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 7 +++ csrc/aio/py_lib/deepspeed_py_copy.cpp | 2 +- ...cated_tile_access_iterator_residual_last.h | 8 +-- csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 28 ++++++++-- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 9 +++- csrc/gds/py_lib/py_ds_gds.cpp | 7 ++- csrc/gds/py_test/validate_gds.py | 7 ++- csrc/includes/simd.h | 2 +- csrc/xpu/includes/simd.h | 2 +- csrc/xpu/includes/type_shim.h | 10 ++-- op_builder/async_io.py | 2 +- op_builder/gds.py | 2 +- tests/unit/ops/aio/test_gds.py | 52 ++++++++++++++----- 13 files changed, 101 insertions(+), 37 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index b3b1932bd70b..41790b99bb88 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -39,6 +39,13 @@ void cpu_op_desc_t::finish() { if (_read_op) { if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } +#if defined(__ENABLE_CANN__) + if (torch_npu::utils::is_npu(_buffer)) { + auto device = at::Device("npu:0"); + _buffer.copy_(_cpu_buffer.to(device)); + } +#endif } } diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index 561c46f7c287..f5480e9d9d83 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices. #include "deepspeed_py_copy.h" #include -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) or defined(__AVX256__) union AVX_Data { diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h index 7f6a2430845a..dcbdc11c27ad 100644 --- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h @@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}; }; private: @@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}; }; private: diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index 859ca19535a4..94b89afb6941 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -46,6 +46,11 @@ void deepspeed_gds_handle_t::_init_cuFile(const int block_size, std::cerr << "Can't open local cufile" << std::endl; exit(EXIT_FAILURE); } + // TODO: Address the following issues with this code + // (1) Fix C++14 warning + // (2) Create file in a different location than PWD + // (3) Handle multi-GPU/multi-rank scenarios: should cufile be shared, is per-rank cufile + // safe? putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); cuFileDriverOpen(); cudaCheckError(); @@ -65,16 +70,31 @@ void deepspeed_gds_handle_t::_close_cuFile() if (deepspeed_gds_handle_t::s_cuFile_init == 0) { cuFileDriverClose(); } } -int deepspeed_gds_handle_t::new_device_locked_tensor(const torch::Tensor& buffer) +torch::Tensor deepspeed_gds_handle_t::new_pinned_device_tensor(const size_t num_elem, + const torch::Tensor& example_tensor) +{ + auto options = torch::TensorOptions().dtype(example_tensor.scalar_type()).device(torch::kCUDA); + auto dev_tensor = torch::empty(num_elem, options); + pin_device_tensor(dev_tensor); + return dev_tensor; +} + +bool deepspeed_gds_handle_t::free_pinned_device_tensor(torch::Tensor& buffer) +{ + unpin_device_tensor(buffer); + return true; +} + +bool deepspeed_gds_handle_t::pin_device_tensor(const torch::Tensor& buffer) { gds_op_desc_t::add_buffer_to_registry(buffer); - return 0; + return true; } -int deepspeed_gds_handle_t::free_device_locked_tensor(const torch::Tensor& buffer) +bool deepspeed_gds_handle_t::unpin_device_tensor(const torch::Tensor& buffer) { gds_op_desc_t::remove_buffer_from_registry(buffer); - return 0; + return true; } std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index 0e42b07a49dc..e04f3b54da37 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -20,9 +20,14 @@ struct deepspeed_gds_handle_t : deepspeed_aio_handle_t { ~deepspeed_gds_handle_t(); - int new_device_locked_tensor(const torch::Tensor& buffer); + torch::Tensor new_pinned_device_tensor(const size_t num_elem, + const torch::Tensor& example_tensor); - int free_device_locked_tensor(const torch::Tensor& buffer); + bool free_pinned_device_tensor(torch::Tensor&); + + bool pin_device_tensor(const torch::Tensor& buffer); + + bool unpin_device_tensor(const torch::Tensor& buffer); void _init_cuFile(const int block_size, const int queue_length, const int num_threads); diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp index 7d1c34e3bcad..10a7da1535ed 100644 --- a/csrc/gds/py_lib/py_ds_gds.cpp +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -41,8 +41,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor) .def("free_cpu_locked_tensor", &deepspeed_gds_handle_t::free_cpu_locked_tensor) - .def("new_device_locked_tensor", &deepspeed_gds_handle_t::new_device_locked_tensor) - .def("free_device_locked_tensor", &deepspeed_gds_handle_t::free_device_locked_tensor) + + .def("new_pinned_device_tensor", &deepspeed_gds_handle_t::new_pinned_device_tensor) + .def("free_pinned_device_tensor", &deepspeed_gds_handle_t::free_pinned_device_tensor) + .def("pin_device_tensor", &deepspeed_gds_handle_t::pin_device_tensor) + .def("unpin_device_tensor", &deepspeed_gds_handle_t::unpin_device_tensor) .def("wait", &deepspeed_gds_handle_t::wait); } diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py index b34b1194f582..ea306f287ae6 100644 --- a/csrc/gds/py_test/validate_gds.py +++ b/csrc/gds/py_test/validate_gds.py @@ -6,5 +6,8 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ from deepspeed.ops.op_builder import GDSBuilder -assert GDSBuilder().is_compatible(True) -assert GDSBuilder().load(True) +import pdb + +pdb.set_trace() +assert GDSBuilder().is_compatible(False) +# assert GDSBuilder().load(True) diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index f5bfb45dd2e2..a205026ec7c1 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val) std::memcpy(dst, &val, sizeof(T)); } -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h index f77568be7835..097e2d8585cc 100644 --- a/csrc/xpu/includes/simd.h +++ b/csrc/xpu/includes/simd.h @@ -13,7 +13,7 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h index fa41757c895b..1897afd1fea2 100644 --- a/csrc/xpu/includes/type_shim.h +++ b/csrc/xpu/includes/type_shim.h @@ -82,11 +82,11 @@ } template -__inline__ __attribute__((always_inline)) T reduce_block_into_lanes( - T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. +__inline__ __attribute__((always_inline)) T +reduce_block_into_lanes(T* x, + T val, + int lanes = 1, + bool share_result = false) // lanes is intended to be <= 32. { auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); diff --git a/op_builder/async_io.py b/op_builder/async_io.py index a9039def3a40..63c03d0a08ae 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -84,7 +84,7 @@ def check_for_libaio_pkg(self): break return found - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): # Check for the existence of libaio by using distutils # to compile and link a test program that calls io_submit, # which is a function provided by libaio that is used in the async_io op. diff --git a/op_builder/gds.py b/op_builder/gds.py index 3b06ca16f40d..e024674e01d8 100644 --- a/op_builder/gds.py +++ b/op_builder/gds.py @@ -35,7 +35,7 @@ def include_paths(self): def extra_ldflags(self): return super().extra_ldflags() + ['-lcufile'] - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): import torch.utils.cpp_extension CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64") diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py index 7afa5970d69f..53655994b560 100644 --- a/tests/unit/ops/aio/test_gds.py +++ b/tests/unit/ops/aio/test_gds.py @@ -47,7 +47,7 @@ def _get_test_write_file(tmpdir, index): def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index=0): test_file = _get_test_write_file(tmpdir, index) test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) - gds_handle.new_device_locked_tensor(test_buffer) + gds_handle.pin_device_tensor(test_buffer) return test_file, test_buffer @@ -64,7 +64,6 @@ def _validate_handle_state(handle, single_submit, overlap_events): class TestRead(DistributedTest): world_size = 1 reuse_dist_env = True - requires_cuda_env = False if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -74,7 +73,7 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events): h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) - h.new_device_locked_tensor(gds_buffer) + h.pin_device_tensor(gds_buffer) _validate_handle_state(h, single_submit, overlap_events) @@ -86,14 +85,14 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events): ref_buffer = list(f.read()) assert ref_buffer == gds_buffer.tolist() - h.free_device_locked_tensor(gds_buffer) + h.unpin_device_tensor(gds_buffer) def test_async_read(self, tmpdir, single_submit, overlap_events): h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) - h.new_device_locked_tensor(gds_buffer) + h.pin_device_tensor(gds_buffer) _validate_handle_state(h, single_submit, overlap_events) @@ -108,7 +107,7 @@ def test_async_read(self, tmpdir, single_submit, overlap_events): ref_buffer = list(f.read()) assert ref_buffer == gds_buffer.tolist() - h.free_device_locked_tensor(gds_buffer) + h.unpin_device_tensor(gds_buffer) @pytest.mark.parametrize("single_submit", [True, False]) @@ -116,7 +115,6 @@ def test_async_read(self, tmpdir, single_submit, overlap_events): class TestWrite(DistributedTest): world_size = 1 reuse_dist_env = True - requires_cuda_env = False if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -133,7 +131,7 @@ def test_parallel_write(self, tmpdir, single_submit, overlap_events): write_status = h.sync_pwrite(gds_buffer, gds_file) assert write_status == 1 - h.free_device_locked_tensor(gds_buffer) + h.unpin_device_tensor(gds_buffer) assert os.path.isfile(gds_file) @@ -154,7 +152,7 @@ def test_async_write(self, tmpdir, single_submit, overlap_events): wait_status = h.wait() assert wait_status == 1 - h.free_device_locked_tensor(gds_buffer) + h.unpin_device_tensor(gds_buffer) assert os.path.isfile(gds_file) @@ -165,7 +163,6 @@ def test_async_write(self, tmpdir, single_submit, overlap_events): @pytest.mark.sequential class TestAsyncQueue(DistributedTest): world_size = 1 - requires_cuda_env = False if not get_accelerator().is_available(): init_distributed = False set_dist_env = False @@ -186,7 +183,7 @@ def test_read(self, tmpdir, async_queue): torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) ] for buf in gds_buffers: - h.new_device_locked_tensor(buf) + h.pin_device_tensor(buf) _validate_handle_state(h, single_submit, overlap_events) @@ -203,7 +200,7 @@ def test_read(self, tmpdir, async_queue): assert ref_buffer == gds_buffers[i].tolist() for t in gds_buffers: - h.free_device_locked_tensor(t) + h.unpin_device_tensor(t) @pytest.mark.parametrize("async_queue", [2, 3]) def test_write(self, tmpdir, async_queue): @@ -235,10 +232,39 @@ def test_write(self, tmpdir, async_queue): assert wait_status == async_queue for t in gds_buffers: - h.free_device_locked_tensor(t) + h.unpin_device_tensor(t) for i in range(async_queue): assert os.path.isfile(gds_files[i]) filecmp.clear_cache() assert filecmp.cmp(ref_files[i], gds_files[i], shallow=False) + + +@pytest.mark.parametrize("use_new_api", [True, False]) +class TestLockDeviceTensor(DistributedTest): + world_size = 2 + reuse_dist_env = True + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_pin_device_tensor(self, use_new_api): + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + unpinned_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + if use_new_api: + pinned_buffer = h.new_pinned_device_tensor(unpinned_buffer.numel(), unpinned_buffer) + else: + pinned_buffer = torch.empty_like(unpinned_buffer) + h.pin_device_tensor(pinned_buffer) + + assert unpinned_buffer.device == pinned_buffer.device + assert unpinned_buffer.dtype == pinned_buffer.dtype + assert unpinned_buffer.numel() == pinned_buffer.numel() + + if use_new_api: + h.free_pinned_device_tensor(pinned_buffer) + else: + h.unpin_device_tensor(pinned_buffer) From 8baf546f096fd169c9c11d7057f9b17ab3a67b2c Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 11 Aug 2024 21:08:39 -0400 Subject: [PATCH 20/31] Formatting fixes --- csrc/aio/common/deepspeed_aio_common.cpp | 5 ++--- csrc/aio/py_lib/deepspeed_py_aio.cpp | 10 ++++------ csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 10 ++++------ .../evoformer_attn/gemm_kernel_utils.h | 9 ++++----- .../predicated_tile_access_iterator_residual_last.h | 8 ++++---- 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index 0f2895dfa328..a65cc500cc82 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -301,9 +301,8 @@ int regular_read(const char* filename, std::vector& buffer) } while (r > 0); if (read_bytes != num_bytes) { - std::cerr << "read error " - << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes - << std::endl; + std::cerr << "read error " << " read_bytes (read) = " << read_bytes + << " num_bytes (fstat) = " << num_bytes << std::endl; } assert(read_bytes == num_bytes); close(fd); diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 30b6682ada72..eac268d33433 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -69,9 +69,8 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } @@ -115,8 +114,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 8f2c82f86968..1ed3645276bd 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -91,9 +91,8 @@ int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, co if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } @@ -126,9 +125,8 @@ int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h index 2a4300c5cac1..c102234a4dfb 100644 --- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h +++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h @@ -125,11 +125,10 @@ struct CheckArch { std::cerr << #PTR " is not correctly aligned\n"; \ return false; \ } -#define EVOFORMER_CHECK(COND, ERR) \ - if (!(COND)) { \ - std::cerr << "[Evoformer Attention]" \ - << "'" #COND "' failed: " << ERR << "\n"; \ - return false; \ +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ } #endif diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h index dcbdc11c27ad..7f6a2430845a 100644 --- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h @@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}; + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; }; private: @@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast tensor's layout CUTLASS_HOST_DEVICE Params(Layout const& layout) - : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}; + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; }; private: From 101a43d3b323427b107dc5c6d0c442ef6ad34869 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 12 Aug 2024 10:43:45 -0400 Subject: [PATCH 21/31] Avoid xpu regression (#4130) --- op_builder/async_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 63c03d0a08ae..63fc2c94529c 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -90,7 +90,7 @@ def is_compatible(self, verbose=False): # which is a function provided by libaio that is used in the async_io op. # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS # respectively to specify the directories for libaio.h and libaio.so. - aio_compatible = self.has_function('io_submit', ('aio', )) + aio_compatible = self.has_function('io_pgetevents', ('aio', )) if verbose and not aio_compatible: self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") From 76f8e1ce63f78b0f23878b065fe64f5c61193c25 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Aug 2024 20:30:22 +0000 Subject: [PATCH 22/31] microbenchmark updates --- csrc/aio/py_test/ds_aio_handle.py | 2 +- csrc/aio/py_test/run_read_sweep.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index a9c5a9d207d7..96c72f08027b 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -43,7 +43,7 @@ def pre_handle(args, tid, read_op): io_parallel = args.io_parallel if args.io_parallel else 1 if gds: - handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel) + handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel) handle.new_device_locked_tensor(buffer) else: handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel) diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 7c2cb46f83fc..59d82996a0e2 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -95,7 +95,7 @@ for xtype in cpu gpu gds; do for t in 1 2 4 8; do for d in 8 16 32 64 128; do for bs in 128K 256K 512K 1M 2M 4M 8M 16M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme03:0" + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme01:0" OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" From 6c464031edbd2c01e7643f331342a3b6c0bd9df5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 15 Aug 2024 11:15:11 -0400 Subject: [PATCH 23/31] Avoid passing class member num_threads --- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 9 +++------ csrc/aio/py_lib/deepspeed_py_aio_handle.h | 1 - csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 5 ++--- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 1 - 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 1ed3645276bd..9c560621dc63 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -207,11 +207,10 @@ std::shared_ptr deepspeed_aio_handle_t::_create_io_op_desc( const int fd, const char* filename, const long long int file_num_bytes, - const int num_threads, const bool validate) { return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); + read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); } int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, @@ -238,8 +237,7 @@ int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, const auto fd = open_file(filename, true); if (fd == -1) { return -1; } - auto scheduled_op = - _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, _num_threads, validate); + auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate); _schedule_aio_work(scheduled_op); @@ -261,8 +259,7 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, const auto fd = open_file(filename, false); if (fd == -1) { return -1; } - auto scheduled_op = - _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, _num_threads, validate); + auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate); _schedule_aio_work(scheduled_op); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index c89998974743..8f1d8f5d0514 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -81,6 +81,5 @@ struct deepspeed_aio_handle_t { const int fd, const char* filename, const long long int file_num_bytes, - const int num_threads, const bool validate); }; diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index 94b89afb6941..c58b6da405ff 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -103,13 +103,12 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( const int fd, const char* filename, const long long int file_num_bytes, - const int num_threads, const bool validate) { if (buffer.is_cuda()) { return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); + read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); } return deepspeed_aio_handle_t::_create_io_op_desc( - read_op, buffer, fd, filename, file_num_bytes, num_threads, validate); + read_op, buffer, fd, filename, file_num_bytes, validate); } diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index e04f3b54da37..2d53b0efc190 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -38,7 +38,6 @@ struct deepspeed_gds_handle_t : deepspeed_aio_handle_t { const int fd, const char* filename, const long long int file_num_bytes, - const int num_threads, const bool validate); static int s_cuFile_init; From d0a222cd51e1657594f23e163d490aab4f6cc0b8 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 15 Aug 2024 11:53:57 -0400 Subject: [PATCH 24/31] Create abstract base class --- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 280 +----------------- csrc/aio/py_lib/deepspeed_py_aio_handle.h | 67 +---- csrc/aio/py_lib/deepspeed_py_io_handle.cpp | 300 ++++++++++++++++++++ csrc/aio/py_lib/deepspeed_py_io_handle.h | 85 ++++++ csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 4 +- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 4 +- op_builder/async_io.py | 13 +- 7 files changed, 400 insertions(+), 353 deletions(-) create mode 100644 csrc/aio/py_lib/deepspeed_py_io_handle.cpp create mode 100644 csrc/aio/py_lib/deepspeed_py_io_handle.h diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 9c560621dc63..c7ca5e82afde 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -12,289 +12,13 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. using namespace std; -static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } - deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, const int num_threads) - : _aio_ctxt(new aio_context(block_size, queue_depth)), - _single_submit(single_submit), - _overlap_events(overlap_events), - _num_threads(num_threads), - _aio_config(block_size, queue_depth, single_submit, overlap_events, false), - _num_pending_ops(0), - _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) -{ - for (auto i = 0; i < num_threads; ++i) { - _thread_contexts.push_back(std::make_shared(i, _aio_config)); - } - - for (auto& ctxt : _thread_contexts) { - _threads.push_back(std::thread(_start_aio_thread, ctxt)); - } -} - -deepspeed_aio_handle_t::~deepspeed_aio_handle_t() -{ - _stop_threads(); - for (auto& thr : _threads) { thr.join(); } -} - -const int deepspeed_aio_handle_t::get_block_size() const -{ - return _aio_ctxt ? _aio_ctxt->_block_size : -1; -} - -const int deepspeed_aio_handle_t::get_queue_depth() const -{ - return _aio_ctxt ? _aio_ctxt->_queue_depth : -1; -} - -const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; } - -const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } - -const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } - -int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) -{ - const auto start_time = std::chrono::high_resolution_clock::now(); - - assert(_aio_ctxt); - - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto read_buffer = (char*)buffer.data_ptr(); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - - close(fd); - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; - return 0; -} - -int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, - const char* filename, - const bool validate) -{ - assert(_aio_ctxt); - - const auto start_time = std::chrono::high_resolution_clock::now(); - - const auto fd = open_file(filename, false); - if (fd == -1) { return -1; } - - auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - close(fd); - - if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } - - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; - return 0; -} - -void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) -{ - for (auto& ctxt : _thread_contexts) { - { - std::lock_guard lock(ctxt->_work_sync._mutex); - ctxt->_work_queue.push(scheduled_op); - } - ctxt->_work_sync._cond_var.notify_one(); - } - _num_pending_ops++; -} - -std::shared_ptr deepspeed_aio_handle_t::_wait_for_aio_work() -{ - std::shared_ptr completed_op = nullptr; - for (auto& ctxt : _thread_contexts) { - std::unique_lock lock(ctxt->_complete_sync._mutex); - ctxt->_complete_sync._cond_var.wait(lock, - [ctxt] { return !ctxt->_complete_queue.empty(); }); - completed_op = ctxt->_complete_queue.front(); - ctxt->_complete_queue.pop(); - } - return completed_op; -} - -void deepspeed_aio_handle_t::_stop_threads() -{ - assert(0 == _num_pending_ops); - for (auto& ctxt : _thread_contexts) { - { - std::lock_guard lock(ctxt->_work_sync._mutex); - ctxt->_time_to_exit = true; - } - ctxt->_work_sync._cond_var.notify_one(); - } -} - -int deepspeed_aio_handle_t::wait() -{ - assert(_num_pending_ops > 0); - auto num_completed_ops = 0; - - while (_num_pending_ops > 0) { - auto completed_op = _wait_for_aio_work(); - - if (completed_op->_validate) { completed_op->validate(); } - - completed_op->finish(); - - close(completed_op->_fd); - - --_num_pending_ops; - ++num_completed_ops; - } - - return num_completed_ops; -} - -bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, - const long long int num_bytes) -{ - const auto op_string = read_op ? "Read" : "Write"; - if (num_bytes % get_thread_count()) { - std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes - << " not divisible by thread count = " << get_thread_count() << std::endl; - return false; - } - - return true; -} - -std::shared_ptr deepspeed_aio_handle_t::_create_io_op_desc( - const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const bool validate) + : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads) { - return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); } -int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) -{ - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - const auto buffer_bytes = static_cast(buffer.nbytes()); - if (buffer_bytes != num_file_bytes) { - std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes - << " != " << num_file_bytes << std::endl; - } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - assert((num_file_bytes % _num_threads) == 0); - - if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } - - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate); - - _schedule_aio_work(scheduled_op); - - if (async) { return 0; } - - return wait(); -} - -int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) -{ - const auto num_write_bytes = static_cast(buffer.nbytes()); - assert((num_write_bytes % _num_threads) == 0); - - if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } - - const auto fd = open_file(filename, false); - if (fd == -1) { return -1; } - - auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate); - - _schedule_aio_work(scheduled_op); - - if (async) { return 0; } - - return wait(); -} - -int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) -{ - return pread(buffer, filename, false, false); -} - -int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) -{ - return pwrite(buffer, filename, false, false); -} - -int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename) -{ - return pread(buffer, filename, false, true); -} - -int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) -{ - return pwrite(buffer, filename, false, true); -} - -at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem, - const torch::Tensor& example_tensor) -{ - return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); -} - -bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor) -{ - return _pinned_tensor_mgr->free(locked_tensor); -} +deepspeed_aio_handle_t::~deepspeed_aio_handle_t() {} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index 8f1d8f5d0514..eb6b90ea22f0 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -9,21 +9,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include -#include "deepspeed_aio_thread.h" -#include "deepspeed_pin_tensor.h" - -struct deepspeed_aio_handle_t { - std::unique_ptr _aio_ctxt; - const bool _single_submit; - const bool _overlap_events; - const int _num_threads; - deepspeed_aio_config_t _aio_config; - - std::vector> _thread_contexts; - std::vector _threads; - int _num_pending_ops; - std::unique_ptr _pinned_tensor_mgr; +#include "deepspeed_py_io_handle.h" +struct deepspeed_aio_handle_t : deepspeed_io_handle_t { deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, @@ -31,55 +19,4 @@ struct deepspeed_aio_handle_t { const int num_threads); ~deepspeed_aio_handle_t(); - - const int get_block_size() const; - const int get_queue_depth() const; - const bool get_single_submit() const; - const bool get_overlap_events() const; - const int get_thread_count() const; - - int read(torch::Tensor& buffer, const char* filename, const bool validate); - - int write(const torch::Tensor& buffer, const char* filename, const bool validate); - - int pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async); - - int pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async); - - int sync_pread(torch::Tensor& buffer, const char* filename); - - int sync_pwrite(const torch::Tensor& buffer, const char* filename); - - int async_pread(torch::Tensor& buffer, const char* filename); - - int async_pwrite(const torch::Tensor& buffer, const char* filename); - - // TODO: Make API's args to be shape and dtype. - torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor); - - bool free_cpu_locked_tensor(torch::Tensor&); - - int wait(); - - void _stop_threads(); - - void _schedule_aio_work(std::shared_ptr scheduled_op); - - std::shared_ptr _wait_for_aio_work(); - - bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); - - virtual std::shared_ptr _create_io_op_desc( - const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const bool validate); }; diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp new file mode 100644 index 000000000000..c31ca3040ee2 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -0,0 +1,300 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_io_handle.h" +#include + +using namespace std; + +static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } + +deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads) + : _aio_ctxt(new aio_context(block_size, queue_depth)), + _single_submit(single_submit), + _overlap_events(overlap_events), + _num_threads(num_threads), + _aio_config(block_size, queue_depth, single_submit, overlap_events, false), + _num_pending_ops(0), + _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) +{ + for (auto i = 0; i < num_threads; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); + } + + for (auto& ctxt : _thread_contexts) { + _threads.push_back(std::thread(_start_aio_thread, ctxt)); + } +} + +deepspeed_io_handle_t::~deepspeed_io_handle_t() +{ + _stop_threads(); + for (auto& thr : _threads) { thr.join(); } +} + +const int deepspeed_io_handle_t::get_block_size() const +{ + return _aio_ctxt ? _aio_ctxt->_block_size : -1; +} + +const int deepspeed_io_handle_t::get_queue_depth() const +{ + return _aio_ctxt ? _aio_ctxt->_queue_depth : -1; +} + +const bool deepspeed_io_handle_t::get_single_submit() const { return _single_submit; } + +const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_events; } + +const int deepspeed_io_handle_t::get_thread_count() const { return _num_threads; } + +int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + + assert(_aio_ctxt); + + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + close(fd); + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; + return 0; +} + +int deepspeed_io_handle_t::write(const torch::Tensor& buffer, + const char* filename, + const bool validate) +{ + assert(_aio_ctxt); + + const auto start_time = std::chrono::high_resolution_clock::now(); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; + return 0; +} + +void deepspeed_io_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) +{ + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_work_queue.push(scheduled_op); + } + ctxt->_work_sync._cond_var.notify_one(); + } + _num_pending_ops++; +} + +std::shared_ptr deepspeed_io_handle_t::_wait_for_aio_work() +{ + std::shared_ptr completed_op = nullptr; + for (auto& ctxt : _thread_contexts) { + std::unique_lock lock(ctxt->_complete_sync._mutex); + ctxt->_complete_sync._cond_var.wait(lock, + [ctxt] { return !ctxt->_complete_queue.empty(); }); + completed_op = ctxt->_complete_queue.front(); + ctxt->_complete_queue.pop(); + } + return completed_op; +} + +void deepspeed_io_handle_t::_stop_threads() +{ + assert(0 == _num_pending_ops); + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_time_to_exit = true; + } + ctxt->_work_sync._cond_var.notify_one(); + } +} + +int deepspeed_io_handle_t::wait() +{ + assert(_num_pending_ops > 0); + auto num_completed_ops = 0; + + while (_num_pending_ops > 0) { + auto completed_op = _wait_for_aio_work(); + + if (completed_op->_validate) { completed_op->validate(); } + + completed_op->finish(); + + close(completed_op->_fd); + + --_num_pending_ops; + ++num_completed_ops; + } + + return num_completed_ops; +} + +bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, + const long long int num_bytes) +{ + const auto op_string = read_op ? "Read" : "Write"; + if (num_bytes % get_thread_count()) { + std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + << " not divisible by thread count = " << get_thread_count() << std::endl; + return false; + } + + return true; +} + +std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const bool validate) +{ + return std::make_shared( + read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); +} + +int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + const auto buffer_bytes = static_cast(buffer.nbytes()); + if (buffer_bytes != num_file_bytes) { + std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes + << " != " << num_file_bytes << std::endl; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert((num_file_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, false); +} + +int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, false); +} + +int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, true); +} + +int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, true); +} + +at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem, + const torch::Tensor& example_tensor) +{ + return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); +} + +bool deepspeed_io_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor) +{ + return _pinned_tensor_mgr->free(locked_tensor); +} diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h new file mode 100644 index 000000000000..b9894fe36416 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include "deepspeed_aio_thread.h" +#include "deepspeed_pin_tensor.h" + +struct deepspeed_io_handle_t { + std::unique_ptr _aio_ctxt; + const bool _single_submit; + const bool _overlap_events; + const int _num_threads; + deepspeed_aio_config_t _aio_config; + + std::vector> _thread_contexts; + std::vector _threads; + int _num_pending_ops; + std::unique_ptr _pinned_tensor_mgr; + + deepspeed_io_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads); + + virtual ~deepspeed_io_handle_t() = 0; + + const int get_block_size() const; + const int get_queue_depth() const; + const bool get_single_submit() const; + const bool get_overlap_events() const; + const int get_thread_count() const; + + int read(torch::Tensor& buffer, const char* filename, const bool validate); + + int write(const torch::Tensor& buffer, const char* filename, const bool validate); + + int pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int sync_pread(torch::Tensor& buffer, const char* filename); + + int sync_pwrite(const torch::Tensor& buffer, const char* filename); + + int async_pread(torch::Tensor& buffer, const char* filename); + + int async_pwrite(const torch::Tensor& buffer, const char* filename); + + // TODO: Make API's args to be shape and dtype. + torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor); + + bool free_cpu_locked_tensor(torch::Tensor&); + + int wait(); + + void _stop_threads(); + + void _schedule_aio_work(std::shared_ptr scheduled_op); + + std::shared_ptr _wait_for_aio_work(); + + bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); + + virtual std::shared_ptr _create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const bool validate); +}; diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index c58b6da405ff..3a35ad3145a0 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -20,7 +20,7 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size, const bool single_submit, const bool overlap_events, const int num_threads) - : deepspeed_aio_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads) + : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads) { _init_cuFile(block_size, queue_depth, num_threads); } @@ -109,6 +109,6 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( return std::make_shared( read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); } - return deepspeed_aio_handle_t::_create_io_op_desc( + return deepspeed_io_handle_t::_create_io_op_desc( read_op, buffer, fd, filename, file_num_bytes, validate); } diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index 2d53b0efc190..f324e6b65e80 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -9,9 +9,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include -#include "deepspeed_py_aio_handle.h" +#include "deepspeed_py_io_handle.h" -struct deepspeed_gds_handle_t : deepspeed_aio_handle_t { +struct deepspeed_gds_handle_t : deepspeed_io_handle_t { deepspeed_gds_handle_t(const int block_size, const int queue_depth, const bool single_submit, diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 63fc2c94529c..08413517b134 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -22,11 +22,12 @@ def absolute_name(self): def lib_sources(self): src_list = [ - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', - 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] return src_list @@ -90,7 +91,7 @@ def is_compatible(self, verbose=False): # which is a function provided by libaio that is used in the async_io op. # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS # respectively to specify the directories for libaio.h and libaio.so. - aio_compatible = self.has_function('io_pgetevents', ('aio', )) + aio_compatible = self.has_function('io_submit', ('aio', )) if verbose and not aio_compatible: self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") From 01f74571874ce4419241d56bc876e7ed6a18ffa0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 16:24:08 +0000 Subject: [PATCH 25/31] Remove pdb --- csrc/gds/py_test/validate_gds.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py index ea306f287ae6..b34b1194f582 100644 --- a/csrc/gds/py_test/validate_gds.py +++ b/csrc/gds/py_test/validate_gds.py @@ -6,8 +6,5 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ from deepspeed.ops.op_builder import GDSBuilder -import pdb - -pdb.set_trace() -assert GDSBuilder().is_compatible(False) -# assert GDSBuilder().load(True) +assert GDSBuilder().is_compatible(True) +assert GDSBuilder().load(True) From ccc7d18323409541774f3f0e0c25eb196baba3e4 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 15 Aug 2024 15:00:18 -0400 Subject: [PATCH 26/31] Formatting --- csrc/aio/py_lib/deepspeed_py_io_handle.cpp | 28 +++++++++++----------- csrc/aio/py_lib/deepspeed_py_io_handle.h | 8 +++---- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp index c31ca3040ee2..bdf2a858d797 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -15,10 +15,10 @@ using namespace std; static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const int num_threads) + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads) : _aio_ctxt(new aio_context(block_size, queue_depth)), _single_submit(single_submit), _overlap_events(overlap_events), @@ -97,8 +97,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con } int deepspeed_io_handle_t::write(const torch::Tensor& buffer, - const char* filename, - const bool validate) + const char* filename, + const bool validate) { assert(_aio_ctxt); @@ -189,7 +189,7 @@ int deepspeed_io_handle_t::wait() } bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, - const long long int num_bytes) + const long long int num_bytes) { const auto op_string = read_op ? "Read" : "Write"; if (num_bytes % get_thread_count()) { @@ -214,9 +214,9 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( } int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) + const char* filename, + const bool validate, + const bool async) { long long num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { @@ -247,9 +247,9 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, } int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) + const char* filename, + const bool validate, + const bool async) { const auto num_write_bytes = static_cast(buffer.nbytes()); assert((num_write_bytes % _num_threads) == 0); @@ -289,7 +289,7 @@ int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* } at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem, - const torch::Tensor& example_tensor) + const torch::Tensor& example_tensor) { return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); } diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h index b9894fe36416..2974ebe87bfc 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -25,10 +25,10 @@ struct deepspeed_io_handle_t { std::unique_ptr _pinned_tensor_mgr; deepspeed_io_handle_t(const int block_size, - const int queue_depth, - const bool single_submit, - const bool overlap_events, - const int num_threads); + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads); virtual ~deepspeed_io_handle_t() = 0; From 5043f43a476ff1ffbc5b443485ed56b698724cfd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 19:12:39 +0000 Subject: [PATCH 27/31] updating sweep script with gds --- csrc/aio/py_test/aio_bench_perf_sweep.py | 9 ++++++--- csrc/aio/py_test/ds_aio_handle.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index 5d4172066a6b..24339ea5a54e 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -38,10 +38,11 @@ def __init__(self, args): self.write = not args.no_write self.flush_cache = not args.no_sudo self.log_dir = args.log_dir + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' if args.gpu: - self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size} --gpu' - else: - self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' + self.other_options += ' --gpu' + if args.gds: + self.other_options += ' --use_gds' def parse_arguments(): @@ -65,6 +66,8 @@ def parse_arguments(): parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.') + parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator') + parser.add_argument( '--no_sudo', action='store_true', diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 721daa96a91a..0b8a58ab1c5c 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -44,7 +44,7 @@ def pre_handle(args, tid, read_op): io_parallel = args.io_parallel if args.io_parallel else 1 if gds: handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel) - handle.new_device_locked_tensor(buffer) + handle.pin_device_tensor(buffer) else: handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel) @@ -79,7 +79,7 @@ def post_handle(pool_params): for buf in [BUFFER, BOUNCE_BUFFER]: if ctxt[buf] is not None: if ctxt['gds']: - ctxt['handle'].free_device_locked_tensor(ctxt[buf]) + ctxt['handle'].unpin_device_tensor(ctxt[buf]) ctxt[buf].detach() ctxt[buf] = None return ctxt From 4833c03ab9d89f5a6d31cef19087398614c49143 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 15 Aug 2024 15:52:23 -0700 Subject: [PATCH 28/31] Formatting --- csrc/aio/py_test/ds_aio_handle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 0b8a58ab1c5c..f4a179deb9ec 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -43,7 +43,8 @@ def pre_handle(args, tid, read_op): io_parallel = args.io_parallel if args.io_parallel else 1 if gds: - handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel) + handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) handle.pin_device_tensor(buffer) else: handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, From 3b400423c92c967e4e0a59cada997de47cd2cb0c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 23:35:14 +0000 Subject: [PATCH 29/31] cpp std based on torch version --- op_builder/builder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index ca4b339e2447..3f962d3dfee3 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -746,10 +746,15 @@ def hipify_extension(self): ) def cxx_args(self): + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: + CPP_STD = '-std=c++17' + else: + CPP_STD = '-std=c++14' if sys.platform == "win32": return ['-O2'] else: - return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] + return ['-O3', CPP_STD, '-g', '-Wno-reorder'] def nvcc_args(self): if self.build_for_cpu: From a75010b101defb87fb8ca65a4bd5ce0e63383e82 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Aug 2024 00:57:39 +0000 Subject: [PATCH 30/31] moving torch check to aio only --- op_builder/async_io.py | 5 +++++ op_builder/builder.py | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 08413517b134..e7f16adbf2a3 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -51,6 +51,11 @@ def include_paths(self): def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O args = super().cxx_args() + import torch + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1): + args.remove('-std=c++17') + args.append('-std=c++14') args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder'] return args diff --git a/op_builder/builder.py b/op_builder/builder.py index 3f962d3dfee3..ca4b339e2447 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -746,15 +746,10 @@ def hipify_extension(self): ) def cxx_args(self): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - CPP_STD = '-std=c++17' - else: - CPP_STD = '-std=c++14' if sys.platform == "win32": return ['-O2'] else: - return ['-O3', CPP_STD, '-g', '-Wno-reorder'] + return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] def nvcc_args(self): if self.build_for_cpu: From 9a36d8be4b7de406f223a271c25625e2b276a510 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 16 Aug 2024 06:19:09 -0400 Subject: [PATCH 31/31] Usability improvements --- csrc/aio/py_lib/py_ds_aio.cpp | 87 ++++++++++++++---- csrc/aio/py_test/aio_bench_perf_sweep.py | 11 ++- csrc/aio/py_test/ds_aio_args.py | 2 +- csrc/gds/py_lib/py_ds_gds.cpp | 111 +++++++++++++++++++---- 4 files changed, 169 insertions(+), 42 deletions(-) mode change 100755 => 100644 csrc/aio/py_lib/py_ds_aio.cpp diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp old mode 100755 new mode 100644 index 78e91c1607b1..3171d0c6bf3c --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -23,11 +23,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) py::class_(m, "aio_handle") .def(py::init(), "AIO handle constructor", - "block_size"_a, - "queue_depth"_a, - "single_submit"_a, - "overlap_events"_a, - "num_threads"_a) + "block_size"_a = 1024 * 1024, + "queue_depth"_a = 128, + "single_submit"_a = false, + "overlap_events"_a = false, + "num_threads"_a = 1) .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) @@ -35,19 +35,74 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) - .def("read", &deepspeed_aio_handle_t::read) - .def("write", &deepspeed_aio_handle_t::write) + .def("read", + &deepspeed_aio_handle_t::read, + "Synchronous and non-parallel file read. Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "validate"_a) - .def("pread", &deepspeed_aio_handle_t::pread) - .def("pwrite", &deepspeed_aio_handle_t::pwrite) + .def("write", + &deepspeed_aio_handle_t::write, + "Synchronous and non-parallel file write. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "validate"_a) - .def("sync_pread", &deepspeed_aio_handle_t::sync_pread) - .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite) - .def("async_pread", &deepspeed_aio_handle_t::async_pread) - .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) + .def("pread", + &deepspeed_aio_handle_t::pread, + "Parallel file read with option of parallelism. Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "async"_a) - .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor) - .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor) + .def("pwrite", + &deepspeed_aio_handle_t::pwrite, + "Parallel file write with option of parallelism. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "async"_a) - .def("wait", &deepspeed_aio_handle_t::wait); + .def("sync_pread", + &deepspeed_aio_handle_t::sync_pread, + "Synchrononous parallel file read. Returns count of completed read ops", + "buffer"_a, + "filename"_a) + + .def("sync_pwrite", + &deepspeed_aio_handle_t::sync_pwrite, + "Synchronous parallel file write. Returns count of completed write ops", + "buffer"_a, + "filename"_a) + + .def("async_pread", + &deepspeed_aio_handle_t::async_pread, + "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and " + "following wait() returns count of completed ops.", + "buffer"_a, + "filename"_a) + + .def("async_pwrite", + &deepspeed_aio_handle_t::async_pwrite, + "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " + "count of completed ops.", + "buffer"_a, + "filename"_a) + + .def("new_cpu_locked_tensor", + &deepspeed_aio_handle_t::new_cpu_locked_tensor, + "Allocate pinned CPU tensor.", + "num_elem"_a, + "example_tenosr"_a) + + .def("free_cpu_locked_tensor", + &deepspeed_aio_handle_t::free_cpu_locked_tensor, + "Free pinned CPU tensor.", + "tensor"_a) + + .def("wait", + &deepspeed_aio_handle_t::wait, + "Wait for (ongoing) asynchronous operations to complete"); } diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index 24339ea5a54e..ba95150b11e1 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -20,11 +20,11 @@ OTHER_OPTIONS = '--handle' PERF_SCRIPT = 'test_ds_aio.py' DEFAULT_SWEEP_CONFIG = { - "block_size": ["128K", "256K"], - "queue_depth": [4, 16, 32], + "block_size": ["128K", "1M"], + "queue_depth": [32, 64, 128], "sequential_requests": [True, False], "single_submit": [False], - "io_parallel": [2, 8] + "io_parallel": [1, 2, 8], } @@ -95,8 +95,9 @@ def dump_cmd_lines(cmd_lines): def get_ftd_map(nvme_dir_list): - dir_list = [' '.join(nvme_dir_list[:(i + 1)]) for i in range(len(nvme_dir_list))] - return {'folder_to_device_mapping': dir_list} + ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)] + ftd_arg = [' '.join(ftd for ftd in ftd_list)] + return {'folder_to_device_mapping': ftd_arg} def get_sweep_config_dict(sweep_config_json): diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py index 5fc3098d3357..346feabe4810 100644 --- a/csrc/aio/py_test/ds_aio_args.py +++ b/csrc/aio/py_test/ds_aio_args.py @@ -106,7 +106,7 @@ def parse_arguments(): nargs='+', help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).' 'Can be specified multiple times for multi-process runs,' - 'e.g. --path_map /mnt/nvme0:0 --path_map /mnt/nvme1:15 --gpu' + 'e.g. --folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu' 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15') parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.') diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp index 10a7da1535ed..66eb34d4ea8c 100644 --- a/csrc/gds/py_lib/py_ds_gds.cpp +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -16,11 +16,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) py::class_(m, "gds_handle") .def(py::init(), "GDS handle constructor", - "block_size"_a, - "queue_depth"_a, - "single_submit"_a, - "overlap_events"_a, - "num_threads"_a) + "block_size"_a = 1024 * 1024, + "queue_depth"_a = 128, + "single_submit"_a = false, + "overlap_events"_a = false, + "num_threads"_a = 1) .def("get_block_size", &deepspeed_gds_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth) @@ -28,24 +28,95 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events) .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count) - .def("read", &deepspeed_gds_handle_t::read) - .def("write", &deepspeed_gds_handle_t::write) + .def("read", + &deepspeed_gds_handle_t::read, + "Synchronous and non-parallel file read. Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "validate"_a) - .def("pread", &deepspeed_gds_handle_t::pread) - .def("pwrite", &deepspeed_gds_handle_t::pwrite) + .def("write", + &deepspeed_gds_handle_t::write, + "Synchronous and non-parallel file write. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "validate"_a) - .def("sync_pread", &deepspeed_gds_handle_t::sync_pread) - .def("sync_pwrite", &deepspeed_gds_handle_t::sync_pwrite) - .def("async_pread", &deepspeed_gds_handle_t::async_pread) - .def("async_pwrite", &deepspeed_gds_handle_t::async_pwrite) + .def("pread", + &deepspeed_gds_handle_t::pread, + "Parallel file read with option of parallelism. Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "async"_a) - .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor) - .def("free_cpu_locked_tensor", &deepspeed_gds_handle_t::free_cpu_locked_tensor) + .def("pwrite", + &deepspeed_gds_handle_t::pwrite, + "Parallel file write with option of parallelism. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "async"_a) - .def("new_pinned_device_tensor", &deepspeed_gds_handle_t::new_pinned_device_tensor) - .def("free_pinned_device_tensor", &deepspeed_gds_handle_t::free_pinned_device_tensor) - .def("pin_device_tensor", &deepspeed_gds_handle_t::pin_device_tensor) - .def("unpin_device_tensor", &deepspeed_gds_handle_t::unpin_device_tensor) + .def("sync_pread", + &deepspeed_gds_handle_t::sync_pread, + "Synchrononous parallel file read. Returns count of completed read ops", + "buffer"_a, + "filename"_a) - .def("wait", &deepspeed_gds_handle_t::wait); + .def("sync_pwrite", + &deepspeed_gds_handle_t::sync_pwrite, + "Synchronous parallel file write. Returns count of completed write ops", + "buffer"_a, + "filename"_a) + + .def("async_pread", + &deepspeed_gds_handle_t::async_pread, + "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and " + "following wait() returns count of completed ops.", + "buffer"_a, + "filename"_a) + + .def("async_pwrite", + &deepspeed_gds_handle_t::async_pwrite, + "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " + "count of completed ops.", + "buffer"_a, + "filename"_a) + + .def("new_cpu_locked_tensor", + &deepspeed_gds_handle_t::new_cpu_locked_tensor, + "Allocate pinned CPU tensor.", + "num_elem"_a, + "example_tenosr"_a) + + .def("free_cpu_locked_tensor", + &deepspeed_gds_handle_t::free_cpu_locked_tensor, + "Free pinned CPU tensor.", + "tensor"_a) + + .def("new_pinned_device_tensor", + &deepspeed_gds_handle_t::new_pinned_device_tensor, + "Allocate pinned device tensor.", + "num_elem"_a, + "example_tenosr"_a) + + .def("free_pinned_device_tensor", + &deepspeed_gds_handle_t::free_pinned_device_tensor, + "Free pinned device tensor.", + "tensor"_a) + + .def("pin_device_tensor", + &deepspeed_gds_handle_t::pin_device_tensor, + "Pin device tensor.", + "tensor"_a) + + .def("unpin_device_tensor", + &deepspeed_gds_handle_t::unpin_device_tensor, + "Unpin device tensor.", + "tensor"_a) + + .def("wait", + &deepspeed_gds_handle_t::wait, + "Wait for (ongoing) asynchronous operations to complete"); }