From 6bd8cd95e908c3f9e3be2be4700fcdc3e9886d0e Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Thu, 9 May 2024 18:04:20 +0000
Subject: [PATCH 01/31] AIO API update

---
 csrc/aio/py_test/aio_bench_generate_param.py |   6 +-
 csrc/aio/py_test/aio_bench_perf_sweep.py     | 133 ++++-----------
 csrc/aio/py_test/ds_aio_args.py              | 168 +++++++++++++++++++
 csrc/aio/py_test/ds_aio_basic.py             |  27 +--
 csrc/aio/py_test/ds_aio_handle.py            | 108 ++++++++----
 csrc/aio/py_test/ds_aio_job.py               |  48 ++++++
 csrc/aio/py_test/run_read_sweep.sh           |  43 ++---
 csrc/aio/py_test/test_ds_aio.py              |  66 +-------
 csrc/aio/py_test/test_ds_aio_utils.py        |  41 ++++-
 csrc/aio/py_test/validate_async_io.py        |   1 +
 10 files changed, 400 insertions(+), 241 deletions(-)
 create mode 100644 csrc/aio/py_test/ds_aio_args.py
 create mode 100644 csrc/aio/py_test/ds_aio_job.py

diff --git a/csrc/aio/py_test/aio_bench_generate_param.py b/csrc/aio/py_test/aio_bench_generate_param.py
index 09d0e03c7ef6..7a0ab59ed73d 100644
--- a/csrc/aio/py_test/aio_bench_generate_param.py
+++ b/csrc/aio/py_test/aio_bench_generate_param.py
@@ -41,9 +41,9 @@ def convert_to_param(key):
     return {
         "single_submit": "true" if key[0] == "single" else "false",
         "overlap_events": "true" if key[1] == "overlap" else "false",
-        "thread_count": int(key[3]),
-        "queue_depth": int(key[4]),
-        "block_size": int(key[5])
+        "thread_count": int(key[5]),
+        "queue_depth": int(key[3]),
+        "block_size": int(key[4])
     }
 
 
diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py
index 7d55f7ded65c..5d4172066a6b 100644
--- a/csrc/aio/py_test/aio_bench_perf_sweep.py
+++ b/csrc/aio/py_test/aio_bench_perf_sweep.py
@@ -10,12 +10,11 @@
 import argparse
 import json
 import itertools
-import subprocess
 import shutil
 
-from test_ds_aio_utils import refine_integer_value
+from ds_aio_job import Job, run_job
 from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
-    READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
+    READ_LOG_DIR, WRITE_LOG_DIR
 from deepspeed.ops.op_builder import AsyncIOBuilder
 
 OTHER_OPTIONS = '--handle'
@@ -23,62 +22,34 @@
 DEFAULT_SWEEP_CONFIG = {
     "block_size": ["128K", "256K"],
     "queue_depth": [4, 16, 32],
-    "overlap_events": [True, False],
-    "io_parallel": [2, 8],
-    "single_submit": [False]
+    "sequential_requests": [True, False],
+    "single_submit": [False],
+    "io_parallel": [2, 8]
 }
 
 
-class Job(object):
-
-    def __init__(self, cmd_line, output_file=None, work_dir=None):
-        self.cmd_line = cmd_line
-        self.output_file = output_file
-        self.work_dir = work_dir
-        self.output_fd = None
-
-    def cmd(self):
-        return self.cmd_line
-
-    def get_stdout(self):
-        return self.output_fd
-
-    def get_stderr(self):
-        return self.output_fd
-
-    def get_cwd(self):
-        return self.work_dir
-
-    def open_output_file(self):
-        if self.output_file is not None:
-            self.output_fd = open(self.output_file, 'w')
-
-    def close_output_file(self):
-        if self.output_fd is not None:
-            self.output_fd.close()
-            self.output_fd = None
-
-
 class SweepConfig(object):
 
     def __init__(self, args):
-        self.nvme_dir = args.nvme_dir
-        self.io_size = args.io_size
+        self.folder_to_device_mapping = get_ftd_map(args.nvme_dir)
         self.search_space = get_sweep_config_dict(args.sweep_config)
+        self.search_space.update(self.folder_to_device_mapping)
         self.read = not args.no_read
         self.write = not args.no_write
         self.flush_cache = not args.no_sudo
         self.log_dir = args.log_dir
-        self.loops = args.loops
-        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'
+        if args.gpu:
+            self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size} --gpu'
+        else:
+            self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}'
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser()
 
     parser.add_argument('--nvme_dir',
+                        nargs='+',
                         required=True,
-                        type=str,
                         help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.')
 
     parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.')
@@ -92,6 +63,8 @@ def parse_arguments():
                         default="400M",
                         help='Number of I/O bytes to read/write for performance measurements.')
 
+    parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.')
+
     parser.add_argument(
         '--no_sudo',
         action='store_true',
@@ -118,6 +91,11 @@ def dump_cmd_lines(cmd_lines):
         print(f'{i}:  {cmd}')
 
 
+def get_ftd_map(nvme_dir_list):
+    dir_list = [' '.join(nvme_dir_list[:(i + 1)]) for i in range(len(nvme_dir_list))]
+    return {'folder_to_device_mapping': dir_list}
+
+
 def get_sweep_config_dict(sweep_config_json):
     if sweep_config_json is None:
         return DEFAULT_SWEEP_CONFIG
@@ -148,16 +126,6 @@ def flatten_options(key, value_list):
     return cmd_list
 
 
-def run_job(job):
-    args = ' '.join(job.cmd())
-    print(f'args = {args}')
-    job.open_output_file()
-    proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
-    job.close_output_file()
-    assert proc.returncode == 0, \
-    f"This command failed: {job.cmd()}"
-
-
 def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
     for perf_job in sweep_jobs:
         if flush_cache_job is not None:
@@ -176,7 +144,12 @@ def create_cmd_tags(cmd_line):
         if len(fields) == 1:
             tags[fields[0]] = None
         elif len(fields) == 2:
-            tags[fields[0]] = fields[1]
+            if fields[0] == '--folder_to_device_mapping':
+                tags[fields[0]] = len(fields[1:])
+            else:
+                tags[fields[0]] = fields[1]
+        elif len(fields) > 2:
+            tags[fields[0]] = len(fields[1:])
     return tags
 
 
@@ -184,16 +157,16 @@ def get_log_file(io_op_desc, cmd_line):
     QUEUE_DEPTH = "--queue_depth"
     BLOCK_SIZE = "--block_size"
     SINGLE_SUBMIT = "--single_submit"
-    OVERLAP_EVENTS = "--overlap_events"
-    THREAD_COUNT = "--threads"
+    SEQUENTIAL_REQUESTS = "--sequential_requests"
+    FTD_MAP = "--folder_to_device_mapping"
     IO_PARALLEL = "--io_parallel"
 
     tag_map = {
         QUEUE_DEPTH: "d",
         BLOCK_SIZE: "bs",
         SINGLE_SUBMIT: "single",
-        OVERLAP_EVENTS: "overlap",
-        THREAD_COUNT: "t",
+        SEQUENTIAL_REQUESTS: "sequential",
+        FTD_MAP: "ftd",
         IO_PARALLEL: "p"
     }
 
@@ -201,14 +174,14 @@ def get_log_file(io_op_desc, cmd_line):
         QUEUE_DEPTH: 1,
         BLOCK_SIZE: "1M",
         SINGLE_SUBMIT: "block",
-        OVERLAP_EVENTS: "sequential",
-        THREAD_COUNT: 1,
+        SEQUENTIAL_REQUESTS: "overlap",
+        FTD_MAP: 1,
         IO_PARALLEL: 1
     }
 
     def get_default_value(tag):
         value = tag_default[tag]
-        if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]:
+        if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]:
             return value
         return f'{tag_map[tag]}{value}'
 
@@ -218,7 +191,7 @@ def get_config_value(tag, value):
             return tag_key
         return f'{tag_key}{value}'
 
-    tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
+    tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL]
     log_tags = [io_op_desc]
     cmd_tags = create_cmd_tags(cmd_line)
     for tag in tag_list:
@@ -252,40 +225,14 @@ def async_io_setup():
     return AsyncIOBuilder().is_compatible()
 
 
-def get_block_size_and_count(io_bytes):
-    block_size = 1
-    block_count = io_bytes
-    bytes_in_KB = 1024
-
-    while block_count % bytes_in_KB == 0:
-        block_size *= bytes_in_KB
-        block_count /= bytes_in_KB
-
-    return int(block_size), int(block_count)
-
-
-def create_read_file(sweep_config):
-    read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
-    os.makedirs(read_folder, exist_ok=True)
-    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
-    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
-    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
-    print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
-    run_job(dd_job)
-    print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
-    return read_folder, read_file_name
-
-
 def remove_folder(folder):
     assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found"
     shutil.rmtree(folder)
 
 
 def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
-    read_folder, read_file_name = create_read_file(sweep_config)
-    read_option = f'--read_file {read_file_name}'
-    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
-    #dump_cmd_lines(read_cmd_lines)
+    read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
+    #dump_cmd_lines(cmd_lines)
 
     log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
     os.makedirs(log_folder, exist_ok=True)
@@ -294,15 +241,9 @@ def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
 
     launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
 
-    remove_folder(read_folder)
-
 
 def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
-    write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
-    os.makedirs(write_folder, exist_ok=True)
-    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
-    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
-    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
+    write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines]
     #dump_cmd_lines(write_cmd_lines)
 
     log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
@@ -312,8 +253,6 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
 
     launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
 
-    remove_folder(write_folder)
-
 
 def main():
     print("Running performance sweep of deepspeed nvme library")
diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py
new file mode 100644
index 000000000000..66d843d68ea2
--- /dev/null
+++ b/csrc/aio/py_test/ds_aio_args.py
@@ -0,0 +1,168 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+"""
+
+import argparse
+import os
+from test_ds_aio_utils import refine_integer_value
+from deepspeed.accelerator import get_accelerator
+
+MAPPING_DELIMITER = ':'
+
+
+def refine_args(args):
+    if args.io_size and type(args.io_size) == str:
+        args.io_size = refine_integer_value(args.io_size)
+
+    if args.block_size and type(args.block_size) == str:
+        args.block_size = refine_integer_value(args.block_size)
+
+    return args
+
+
+def _get_mapping_dict(args):
+    if args.folder is not None:
+        d = {i: args.folder for i in range(args.multi_process)}
+    else:
+        d = {}
+        for m in args.folder_to_device_mapping:
+            fields = m.split(MAPPING_DELIMITER)
+            d[fields[1]] = fields[0]
+
+    return d
+
+
+def _validate_folder_mapping(args):
+    no_error = True
+    error_messages = []
+    invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m]
+    if len(invalid_mappings) > 0:
+        error_messages.append(
+            f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}')
+        no_error = False
+
+    folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping]
+    invalid_folders = [d for d in folder_list if not os.path.exists(d)]
+    if len(invalid_folders) > 0:
+        error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}')
+        no_error = False
+
+    if args.gpu:
+        device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping]
+        invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()]
+        if len(invalid_device_list) > 0:
+            error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}')
+            no_error = False
+
+    return no_error, error_messages
+
+
+def validate_args(args):
+    no_error = True
+    error_messages = []
+
+    if args.folder is not None and len(args.folder_to_device_mapping) > 0:
+        error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.')
+        no_error = False
+    elif args.folder is None and len(args.folder_to_device_mapping) == 0:
+        error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.')
+        no_error = False
+
+    # Validate --folder
+    if args.folder is not None and not os.path.exists(args.folder):
+        no_error = False
+        error_messages.append(f'Invalid folder in --folder: {args.folder} ')
+
+    # Validate --folder_mapping_to_device
+    if len(args.folder_to_device_mapping) > 0:
+        no_mapping_error, mapping_error_messages = _validate_folder_mapping(args)
+        no_error = no_error and no_mapping_error
+        error_messages += mapping_error_messages
+
+    if not no_error:
+        print(f'Found {len(error_messages)} validation errors')
+        for i, msg in enumerate(error_messages):
+            print(f'{i+1}: {msg}')
+
+    return no_error
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.')
+
+    parser.add_argument('--folder_to_device_mapping',
+                        default=[],
+                        nargs='+',
+                        help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).'
+                        'Can be specified multiple times for multi-process runs,'
+                        'e.g. --path_map /mnt/nvme0:0 --path_map /mnt/nvme1:15 --gpu'
+                        'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15')
+
+    parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.')
+
+    parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)')
+
+    parser.add_argument('--multi_process',
+                        type=int,
+                        default=1,
+                        help='Number of parallel processes doing I/O (default 1).')
+
+    parser.add_argument('--block_size',
+                        type=str,
+                        default='1M',
+                        help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabytes).')
+
+    parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).')
+
+    parser.add_argument('--single_submit',
+                        action='store_true',
+                        help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
+
+    parser.add_argument(
+        '--sequential_requests',
+        action='store_true',
+        help=
+        'Delay I/O request submission until completion of prior requests (default is overlap I/O submission and completion requests.).'
+    )
+
+    parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.')
+
+    parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
+
+    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
+
+    parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
+
+    parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
+
+    parser.add_argument('--slow_bounce_buffer',
+                        action='store_true',
+                        help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.')
+
+    args = parser.parse_args()
+    print(f'args = {args}')
+    return args
+
+
+def get_validated_args():
+    args = parse_arguments()
+    args = refine_args(args)
+    if not validate_args(args):
+        quit()
+    print(f'Successful validation of command line arguments')
+
+    peer_tag = 'gpu' if args.gpu else 'process'
+    args.mapping_dict = _get_mapping_dict(args)
+    args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()]
+    assert len(args.mapping_dict) == len(args.mapping_list)
+    print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping')
+    for i, (device_id, folder) in enumerate(args.mapping_list):
+        print(f'[{i}]: {peer_tag} {device_id} <----> {folder}')
+
+    return args
diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py
index ad2a4349cd0c..e90886e17871 100755
--- a/csrc/aio/py_test/ds_aio_basic.py
+++ b/csrc/aio/py_test/ds_aio_basic.py
@@ -9,10 +9,9 @@
 import torch
 import os
 import time
+from deepspeed.ops.aio import AsyncIOBuilder
 from multiprocessing import Pool, Barrier
 from test_ds_aio_utils import report_results, task_log, task_barrier
-from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import AsyncIOBuilder
 
 
 def pre_basic(args, tid, read_op):
@@ -21,7 +20,7 @@ def pre_basic(args, tid, read_op):
     file = args.read_file if read_op else f'{args.write_file}.{tid}'
 
     task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
-    buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
+    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
     task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
 
     ctxt = {}
@@ -56,7 +55,7 @@ def main_basic_read(pool_params):
     args, tid, ctxt = pool_params
     start_time = time.time()
     AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
-                                     args.single_submit, args.overlap_events, args.validate)
+                                     args.single_submit, not args.sequential_requests, args.validate)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
 
@@ -67,7 +66,7 @@ def main_basic_write(pool_params):
     args, tid, ctxt = pool_params
     start_time = time.time()
     AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
-                                      args.single_submit, args.overlap_events, args.validate)
+                                      args.single_submit, not args.sequential_requests, args.validate)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
 
@@ -90,16 +89,17 @@ def get_schedule(args, read_op):
 
 def _aio_handle_tasklet(pool_params):
     args, tid, read_op = pool_params
+    num_processes = len(args.mapping_dict)
 
     # Create schedule
     schedule = get_schedule(args, read_op)
     task_log(tid, f'schedule = {schedule}')
-    task_barrier(aio_barrier, args.threads)
+    task_barrier(aio_barrier, num_processes)
 
     # Run pre task
     task_log(tid, f'running pre-task')
     ctxt = schedule["pre"]((args, tid))
-    task_barrier(aio_barrier, args.threads)
+    task_barrier(aio_barrier, num_processes)
 
     # Run main tasks in a loop
     ctxt["main_task_sec"] = 0
@@ -107,27 +107,28 @@ def _aio_handle_tasklet(pool_params):
         task_log(tid, f'running main task {i}')
         start_time = time.time()
         ctxt = schedule["main"]((args, tid, ctxt))
-        task_barrier(aio_barrier, args.threads)
+        task_barrier(aio_barrier, num_processes)
         stop_time = time.time()
         ctxt["main_task_sec"] += stop_time - start_time
 
     # Run post task
     task_log(tid, f'running post-task')
     ctxt = schedule["post"]((args, tid, ctxt))
-    task_barrier(aio_barrier, args.threads)
+    task_barrier(aio_barrier, num_processes)
 
     return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
 
 
-def _init_tasklet(b):
+def _init_takslet(b):
     global aio_barrier
     aio_barrier = b
 
 
 def aio_basic_multiprocessing(args, read_op):
-    b = Barrier(args.threads)
-    pool_params = [(args, p, read_op) for p in range(args.threads)]
-    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
+    num_processes = len(args.mapping_dict)
+    b = Barrier(num_processes)
+    pool_params = [(args, p, read_op) for p in range(num_processes)]
+    with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p:
         pool_results = p.map(_aio_handle_tasklet, pool_params)
 
     report_results(args, read_op, pool_results)
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index d35b2713edae..369cb9d4030f 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -10,40 +10,48 @@
 import os
 import time
 from multiprocessing import Pool, Barrier
-from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.ops.aio import AsyncIOBuilder
+from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import AsyncIOBuilder
+
+BUFFER = 'buffer'
+BOUNCE_BUFFER = 'bounce_buffer'
 
 
 def pre_handle(args, tid, read_op):
     io_string = "Read" if read_op else "Write"
-    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
-    file = args.read_file if read_op else f'{args.write_file}.{tid}'
-
-    io_parallel = args.io_parallel if args.io_parallel else 1
-    handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
-                                                args.overlap_events, io_parallel)
-    task_log(tid, f'Created deepspeed aio handle')
+    device_id, folder = args.mapping_list[tid]
+    filename = create_filename(folder, args.read, args.io_size, tid)
+    if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
+        create_file(filename, args.io_size)
 
+    task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
+    bounce_buffer = None
     if args.gpu:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name())
+        device_name = get_accelerator().device_name(device_id)
+        buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name)
+        if not args.slow_bounce_buffer:
+            bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8,
+                                          device='cpu').pin_memory()
     else:
-        if args.use_accelerator_pin_memory:
-            buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
-        else:
-            buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8))
+        buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory()
+    task_log(tid,
+             f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
+             force=True)
 
-    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
+    io_parallel = args.io_parallel if args.io_parallel else 1
+    handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
+                                                not args.sequential_requests, io_parallel)
+    task_log(tid, f'created deepspeed aio handle')
 
     ctxt = {}
-    ctxt['file'] = file
-    ctxt['num_bytes'] = num_bytes
+    ctxt['file'] = filename
+    ctxt['num_bytes'] = args.io_size
     ctxt['handle'] = handle
-    ctxt['buffer'] = buffer
+    ctxt[BUFFER] = buffer
+    ctxt[BOUNCE_BUFFER] = bounce_buffer
     ctxt['elapsed_sec'] = 0
 
-    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
-
     return ctxt
 
 
@@ -61,8 +69,10 @@ def pre_handle_write(pool_params):
 
 def post_handle(pool_params):
     _, _, ctxt = pool_params
-    ctxt["buffer"].detach()
-    ctxt["buffer"] = None
+    for buf in [BUFFER, BOUNCE_BUFFER]:
+        if ctxt[buf] is not None:
+            ctxt[buf].detach()
+            ctxt[buf] = None
     return ctxt
 
 
@@ -71,9 +81,12 @@ def main_parallel_read(pool_params):
     handle = ctxt['handle']
 
     start_time = time.time()
-    ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
+    dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
+    ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True)
     assert ret != -1
     handle.wait()
+    if dest_buffer == BOUNCE_BUFFER:
+        ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
 
@@ -82,9 +95,18 @@ def main_parallel_read(pool_params):
 
 def main_parallel_write(pool_params):
     args, tid, ctxt = pool_params
+    # Avoid overwriting existing files as it could be artificially faster
+    if os.path.isfile(ctxt['file']):
+        os.remove(ctxt['file'])
+
     handle = ctxt['handle']
     start_time = time.time()
-    ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
+    if ctxt[BOUNCE_BUFFER] is not None:
+        source_buffer = BOUNCE_BUFFER
+        ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
+    else:
+        source_buffer = BUFFER
+    ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True)
     assert ret != -1
     handle.wait()
     end_time = time.time()
@@ -98,8 +120,11 @@ def main_handle_read(pool_parms):
     handle = ctxt['handle']
 
     start_time = time.time()
-    ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
+    dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
+    ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate)
     assert ret != -1
+    if dest_buffer == BOUNCE_BUFFER:
+        ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
 
@@ -108,9 +133,18 @@ def main_handle_read(pool_parms):
 
 def main_handle_write(pool_parms):
     args, tid, ctxt = pool_parms
+    # Avoid overwriting existing files as it could be artificially faster
+    if os.path.isfile(ctxt['file']):
+        os.remove(ctxt['file'])
+
     handle = ctxt['handle']
     start_time = time.time()
-    ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
+    if ctxt[BOUNCE_BUFFER] is not None:
+        source_buffer = BOUNCE_BUFFER
+        ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
+    else:
+        source_buffer = BUFFER
+    ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate)
     assert ret != -1
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
@@ -123,27 +157,28 @@ def get_schedule(args, read_op):
     if read_op:
         schedule['pre'] = pre_handle_read
         schedule['post'] = post_handle
-        schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read
+        schedule['main'] = main_parallel_read
     else:
         schedule['pre'] = pre_handle_write
         schedule['post'] = post_handle
-        schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write
+        schedule['main'] = main_parallel_write
 
     return schedule
 
 
 def _aio_handle_tasklet(pool_params):
     args, tid, read_op = pool_params
+    num_processes = len(args.mapping_dict)
 
     # Create schedule
     schedule = get_schedule(args, read_op)
     task_log(tid, f'schedule = {schedule}')
-    task_barrier(aio_barrier, args.threads)
+    task_barrier(aio_barrier, num_processes)
 
     # Run pre task
     task_log(tid, f'running pre-task')
     ctxt = schedule["pre"]((args, tid))
-    task_barrier(aio_barrier, args.threads)
+    task_barrier(aio_barrier, num_processes)
 
     # Run main tasks in a loop
     ctxt["main_task_sec"] = 0
@@ -151,27 +186,28 @@ def _aio_handle_tasklet(pool_params):
         task_log(tid, f'running main task {i}')
         start_time = time.time()
         ctxt = schedule["main"]((args, tid, ctxt))
-        task_barrier(aio_barrier, args.threads)
+        task_barrier(aio_barrier, num_processes)
         stop_time = time.time()
         ctxt["main_task_sec"] += stop_time - start_time
 
     # Run post task
     task_log(tid, f'running post-task')
     ctxt = schedule["post"]((args, tid, ctxt))
-    task_barrier(aio_barrier, args.threads)
+    task_barrier(aio_barrier, num_processes)
 
     return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
 
 
-def _init_tasklet(b):
+def _init_takslet(b):
     global aio_barrier
     aio_barrier = b
 
 
 def aio_handle_multiprocessing(args, read_op):
-    b = Barrier(args.threads)
-    pool_params = [(args, p, read_op) for p in range(args.threads)]
-    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
+    num_processes = len(args.mapping_dict)
+    b = Barrier(num_processes)
+    pool_params = [(args, p, read_op) for p in range(num_processes)]
+    with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p:
         pool_results = p.map(_aio_handle_tasklet, pool_params)
 
     report_results(args, read_op, pool_results)
diff --git a/csrc/aio/py_test/ds_aio_job.py b/csrc/aio/py_test/ds_aio_job.py
new file mode 100644
index 000000000000..bbddee1bf26d
--- /dev/null
+++ b/csrc/aio/py_test/ds_aio_job.py
@@ -0,0 +1,48 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping tensors to/from (NVMe) storage devices.
+"""
+import subprocess
+
+
+class Job(object):
+
+    def __init__(self, cmd_line, output_file=None, work_dir=None):
+        self.cmd_line = cmd_line
+        self.output_file = output_file
+        self.work_dir = work_dir
+        self.output_fd = None
+
+    def cmd(self):
+        return self.cmd_line
+
+    def get_stdout(self):
+        return self.output_fd
+
+    def get_stderr(self):
+        return self.output_fd
+
+    def get_cwd(self):
+        return self.work_dir
+
+    def open_output_file(self):
+        if self.output_file is not None:
+            self.output_fd = open(self.output_file, 'w')
+
+    def close_output_file(self):
+        if self.output_fd is not None:
+            self.output_fd.close()
+            self.output_fd = None
+
+
+def run_job(job):
+    args = ' '.join(job.cmd())
+    print(f'args = {args}')
+    job.open_output_file()
+    proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
+    job.close_output_file()
+    assert proc.returncode == 0, \
+    f"This command failed: {job.cmd()}"
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index b9d7e050454a..d69aa13e49da 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
-if [[ $# -ne 2 ]]; then
-    echo "Usage: $0 <input file> <output log dir>"
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $0 <io_size> <output log dir> <target_gpu>"
     exit 1
 fi
 
 
 function validate_environment()
 {
-    validate_cmd="python ./validate_async_io.py"
+    validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python ./validate_async_io.py"
     eval ${validate_cmd}
     res=$?
     if [[ $res != 0 ]]; then
@@ -20,15 +20,11 @@ function validate_environment()
 
 validate_environment
 
-INPUT_FILE=$1
-if [[ ! -f ${INPUT_FILE} ]]; then
-    echo "Input file not found: ${INPUT_FILE}"
-    exit 1
-fi
-
+IO_SIZE=$1
 LOG_DIR=$2/aio_perf_sweep
+GPU_MEM=$3
 RUN_SCRIPT=./test_ds_aio.py
-READ_OPT="--read_file ${INPUT_FILE}"
+READ_OPT="--read"
 
 if [[ -d ${LOG_DIR} ]]; then
     rm -f ${LOG_DIR}/*
@@ -36,34 +32,41 @@ else
     mkdir -p ${LOG_DIR}
 fi
 
-DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+if [[ ${GPU_MEM} == "gpu" ]]; then
+    gpu_opt="--gpu"
+else
+    gpu_opt=""
+fi
+
+DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sync"
 
 for sub in single block; do
+    ftd_map="--folder_to_device_mapping \
+             /workspace/nvme01/aio:0 "
     if [[ $sub == "single" ]]; then
         sub_opt="--single_submit"
     else
         sub_opt=""
     fi
     for ov in overlap sequential; do
-        if [[ $ov == "overlap" ]]; then
-            ov_opt="--overlap_events"
+        if [[ $ov == "sequential" ]]; then
+            ov_opt="--sequential_requests"
         else
             ov_opt=""
         fi
-        for t in 1 2 4 8; do
-            for p in 1 ; do
-                for d in 1 2 4 8 16 32; do
-                    for bs in 128K 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
-                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+        for p in 1 ; do
+            for t in 1 2 4 8; do
+                for d in 8 16 32; do
+                    for bs in 256K 512K 1M; do
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} ${ftd_map}"
+                        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE}"
                         LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                         cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                         echo ${DISABLE_CACHE}
                         echo ${cmd}
                         echo ${SYNC}
 
-                        eval ${DISABLE_CACHE}
                         eval ${cmd}
                         eval ${SYNC}
                         sleep 2
diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py
index e6242cb35789..6de72755e9e5 100755
--- a/csrc/aio/py_test/test_ds_aio.py
+++ b/csrc/aio/py_test/test_ds_aio.py
@@ -6,79 +6,19 @@
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 
-import os
-import argparse
 import multiprocessing as mp
 from ds_aio_basic import aio_basic_multiprocessing
 from ds_aio_handle import aio_handle_multiprocessing
-from test_ds_aio_utils import refine_args
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument('--read_file', type=str, default=None, help='Read file.')
-
-    parser.add_argument('--write_file', type=str, default=None, help='Write file.')
-
-    parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.')
-
-    parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
-
-    parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
-
-    parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')
-
-    parser.add_argument('--single_submit',
-                        action='store_true',
-                        help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
-
-    parser.add_argument('--overlap_events',
-                        action='store_true',
-                        help='Overlap I/O submission and completion requests.')
-
-    parser.add_argument('--validate', action='store_true', help='Perform validation in library.')
-
-    parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
-
-    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
-
-    parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
-
-    parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
-
-    parser.add_argument('--use_accelerator_pin_memory',
-                        action='store_true',
-                        help='Obtain pinned (CPU page-locked) tensors from accelerator')
-
-    args = parser.parse_args()
-    print(f'args = {args}')
-    return args
-
-
-def validate_args(args):
-    if args.read_file and not os.path.isfile(args.read_file):
-        print(f'args validation error: {args.read_file} not found')
-        return False
-
-    return True
+from ds_aio_args import get_validated_args
 
 
 def main():
     print(f'Testing deepspeed_aio python frontend')
 
-    args = parse_arguments()
-    refine_args(args)
-    if not validate_args(args):
-        quit()
-
+    args = get_validated_args()
     mp.set_start_method('spawn')
     multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
-    if args.read_file:
-        multiprocess_function(args, True)
-
-    if args.write_file:
-        multiprocess_function(args, False)
+    multiprocess_function(args, args.read)
 
 
 if __name__ == "__main__":
diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py
index 6aad114c0bdc..968ff4a60ef9 100755
--- a/csrc/aio/py_test/test_ds_aio_utils.py
+++ b/csrc/aio/py_test/test_ds_aio_utils.py
@@ -6,12 +6,17 @@
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 
+import os
+from ds_aio_job import Job, run_job
+
 BYTES_PER_GB = 1024**3
+BYTES_PER_MB = 1024**2
+BYTES_PER_KB = 1024
 LOG_TIDS = [0]
 
 
-def task_log(tid, msg):
-    if tid in LOG_TIDS:
+def task_log(tid, msg, force=False):
+    if force or tid in LOG_TIDS:
         print(f'tid {tid}: {msg}')
 
 
@@ -31,16 +36,29 @@ def report_results(args, read_op, pool_results):
     total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])
 
     task_latency_sec = max([sec for _, sec, _ in pool_results])
-    task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
+    task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB
     print(f'Task {io_string} Latency = {task_latency_sec} sec')
     print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')
 
     e2e_latency_sec = max([sec for sec, _, _ in pool_results])
-    e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
+    e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB
     print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
     print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
 
 
+def get_block_size_and_count(io_bytes):
+    if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0:
+        block_size = BYTES_PER_MB
+        block_size_string = '1M'
+    else:
+        assert io_bytes % BYTES_PER_KB == 0
+        block_size = BYTES_PER_KB
+        block_size_string = '1K'
+    block_count = io_bytes / block_size
+
+    return block_size_string, int(block_count)
+
+
 def refine_integer_value(value):
     unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
 
@@ -50,9 +68,14 @@ def refine_integer_value(value):
     return int(value)
 
 
-def refine_args(args):
-    if args.write_size and type(args.write_size) == str:
-        args.write_size = refine_integer_value(args.write_size)
+def create_filename(folder, read_op, size, tid):
+    io_string = "read" if read_op else "write"
+    return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}')
+
 
-    if args.block_size and type(args.block_size) == str:
-        args.block_size = refine_integer_value(args.block_size)
+def create_file(filename, num_bytes):
+    block_size, block_count = get_block_size_and_count(num_bytes)
+    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}'])
+    print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....')
+    run_job(dd_job)
+    print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....')
diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py
index 019ec05d49d3..10fb638347bc 100644
--- a/csrc/aio/py_test/validate_async_io.py
+++ b/csrc/aio/py_test/validate_async_io.py
@@ -7,3 +7,4 @@
 """
 from deepspeed.ops.op_builder import AsyncIOBuilder
 assert AsyncIOBuilder().is_compatible()
+assert AsyncIOBuilder().load()

From 3abe3835b6fb3e7e7e7a38977af9059000dc2378 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Tue, 14 May 2024 19:03:08 +0000
Subject: [PATCH 02/31] fixing sweep scripts

---
 csrc/aio/py_test/run_read_sweep.sh  | 14 ++++-----
 csrc/aio/py_test/run_write_sweep.sh | 47 +++++++++++++++++------------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index d69aa13e49da..56c81f41eb70 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -22,6 +22,7 @@ validate_environment
 
 IO_SIZE=$1
 LOG_DIR=$2/aio_perf_sweep
+MAP_DIR=$2/aio
 GPU_MEM=$3
 RUN_SCRIPT=./test_ds_aio.py
 READ_OPT="--read"
@@ -42,8 +43,6 @@ DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sync"
 
 for sub in single block; do
-    ftd_map="--folder_to_device_mapping \
-             /workspace/nvme01/aio:0 "
     if [[ $sub == "single" ]]; then
         sub_opt="--single_submit"
     else
@@ -55,21 +54,22 @@ for sub in single block; do
         else
             ov_opt=""
         fi
-        for p in 1 ; do
+        for p in 1 2 4 8; do
             for t in 1 2 4 8; do
-                for d in 8 16 32; do
+                for d in 16 32 64; do
                     for bs in 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} ${ftd_map}"
-                        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE}"
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}"
+                        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
                         LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                         cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                         echo ${DISABLE_CACHE}
                         echo ${cmd}
                         echo ${SYNC}
 
+                        eval ${DISABLE_CACHE}
                         eval ${cmd}
                         eval ${SYNC}
-                        sleep 2
+                        sleep 1
                     done
                 done
             done
diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh
index 99f2113dda6f..d8abc6869c50 100755
--- a/csrc/aio/py_test/run_write_sweep.sh
+++ b/csrc/aio/py_test/run_write_sweep.sh
@@ -25,25 +25,32 @@ function validate_environment()
 
 validate_environment
 
-if [[ $# -ne 3 ]]; then
-    echo "Usage: $0 <write size in MB> <write dir ><output log dir>"
+if [[ $# -ne 2 ]]; then
+    echo "Usage: $0 <write size in MB> <output log dir><gpu(optional>"
     exit 1
 fi
 
-SIZE="$1M"
-WRITE_DIR=$2
-LOG_DIR=$3/aio_perf_sweep
+IO_SIZE=$1
+LOG_DIR=$2/aio_perf_sweep
+MAP_DIR=$2/aio
+GPU_MEM=$3
+RUN_SCRIPT=./test_ds_aio.py
 
-OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
-WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"
+OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt
+WRITE_OPT=""
 
 
-prep_folder ${WRITE_DIR}
+prep_folder ${MAP_DIR}
 prep_folder ${LOG_DIR}
 
-RUN_SCRIPT=./test_ds_aio.py
 
-DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+if [[ ${GPU_MEM} == "gpu" ]]; then
+    gpu_opt="--gpu"
+else
+    gpu_opt=""
+fi
+
+DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sync"
 
 for sub in single block; do
@@ -53,19 +60,19 @@ for sub in single block; do
         sub_opt=""
     fi
     for ov in overlap sequential; do
-        if [[ $ov == "overlap" ]]; then
-            ov_opt="--overlap_events"
+        if [[ $ov == "sequential" ]]; then
+            ov_opt="--sequential_requests"
         else
             ov_opt=""
         fi
-        for t in 1 2 4 8; do
-            for p in 1; do
-                for d in 1 2 4 8 16 32; do
-                    for bs in 128K 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
-                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+        for p in 1 2 4 8; do
+            for t in 1 2 4 8; do
+                for d in 16 32 64; do
+                    for bs in 256K 512K 1M; do
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}"
+                        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
                         LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
-                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+                        cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                         echo ${DISABLE_CACHE}
                         echo ${cmd}
                         echo ${SYNC}
@@ -73,7 +80,7 @@ for sub in single block; do
                         eval ${DISABLE_CACHE}
                         eval ${cmd}
                         eval ${SYNC}
-                        sleep 2
+                        sleep 1
                     done
                 done
         done

From 1b5dd7462fc01c2e4bf97465482d18606bfc7adf Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Fri, 17 May 2024 21:44:27 +0000
Subject: [PATCH 03/31] adding gds op

---
 csrc/aio/py_lib/deepspeed_aio_op_desc.cpp   |  38 +++++
 csrc/aio/py_lib/deepspeed_aio_op_desc.h     |  41 +++++
 csrc/aio/py_lib/deepspeed_aio_thread.cpp    |  55 +------
 csrc/aio/py_lib/deepspeed_aio_thread.h      |  24 +--
 csrc/aio/py_lib/deepspeed_cpu_op.cpp        |  62 ++++++++
 csrc/aio/py_lib/deepspeed_cpu_op.h          |  30 ++++
 csrc/aio/py_lib/deepspeed_gds_op.cpp        | 162 ++++++++++++++++++++
 csrc/aio/py_lib/deepspeed_gds_op.h          |  44 ++++++
 csrc/aio/py_lib/deepspeed_gds_utils.h       |  91 +++++++++++
 csrc/aio/py_lib/deepspeed_py_aio.cpp        |   3 -
 csrc/aio/py_lib/deepspeed_py_aio.h          |   5 +-
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp |  62 ++++++--
 csrc/aio/py_lib/deepspeed_py_aio_handle.h   |   8 +
 csrc/aio/py_lib/deepspeed_py_copy.cpp       |   2 +-
 csrc/aio/py_lib/deepspeed_py_copy.h         |   3 -
 csrc/aio/py_lib/py_ds_aio.cpp               |  13 +-
 csrc/aio/py_test/ds_aio_args.py             |   7 +
 csrc/aio/py_test/ds_aio_handle.py           |   7 +-
 csrc/aio/py_test/run_read_sweep.sh          |   8 +-
 csrc/aio/py_test/run_write_sweep.sh         |  13 +-
 op_builder/async_io.py                      |  50 +++---
 21 files changed, 596 insertions(+), 132 deletions(-)
 create mode 100644 csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
 create mode 100644 csrc/aio/py_lib/deepspeed_aio_op_desc.h
 create mode 100644 csrc/aio/py_lib/deepspeed_cpu_op.cpp
 create mode 100644 csrc/aio/py_lib/deepspeed_cpu_op.h
 create mode 100644 csrc/aio/py_lib/deepspeed_gds_op.cpp
 create mode 100644 csrc/aio/py_lib/deepspeed_gds_op.h
 create mode 100644 csrc/aio/py_lib/deepspeed_gds_utils.h

diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
new file mode 100644
index 000000000000..5c9bb033c0c2
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include "deepspeed_aio_op_desc.h"
+
+using namespace std;
+
+io_op_desc_t::io_op_desc_t(const bool read_op,
+                           const torch::Tensor& buffer,
+                           const int fd,
+                           const char* filename,
+                           const long long int file_num_bytes,
+                           const int num_threads,
+                           const bool validate)
+    : _read_op(read_op),
+      _buffer(buffer),
+      _fd(fd),
+      _filename(filename),
+      _file_num_bytes(file_num_bytes),
+      _num_threads(num_threads),
+      _num_bytes_per_thread(file_num_bytes / num_threads),
+      _validate(validate)
+{
+}
+
+char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void io_op_desc_t::fini() {}
+
+void io_op_desc_t::validate() {}
+
+void io_op_desc_t::run(const int tid,
+                       std::unique_ptr<aio_context>& aio_ctxt,
+                       deepspeed_aio_config_t* aio_config)
+{
+}
diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
new file mode 100644
index 000000000000..c5bffae10265
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#ifndef _IO_OP_DESC_T_
+#define _IO_OP_DESC_T_
+#include <memory>
+#include <queue>
+#include "deepspeed_py_aio.h"
+
+struct io_op_desc_t {
+    const bool _read_op;
+    torch::Tensor _buffer;
+    int _fd;
+    const std::string _filename;
+    const long long int _file_num_bytes;
+    const int _num_threads;
+    const int _num_bytes_per_thread;
+    torch::Tensor _contiguous_buffer;
+    const bool _validate;
+
+    io_op_desc_t(const bool read_op,
+                 const torch::Tensor& buffer,
+                 const int fd,
+                 const char* filename,
+                 const long long int file_num_bytes,
+                 const int num_threads,
+                 const bool validate);
+
+    virtual void run(const int tid,
+                     std::unique_ptr<aio_context>& aio_ctxt,
+                     deepspeed_aio_config_t* aio_config);
+
+    virtual char* data_ptr() const;
+
+    virtual void validate();
+
+    virtual void fini();
+};
+#endif  // _IO_OP_DESC_T_
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
index c852711a28c0..30c3b4914397 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
@@ -9,50 +9,8 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 #include "deepspeed_aio_thread.h"
 
-#if defined(__ENABLE_CANN__)
-#include "torch_npu/csrc/framework/utils/OpAdapter.h"
-#include "torch_npu/csrc/framework/utils/UtilForOpAdapter.h"
-#endif
-
 using namespace std;
 
-io_op_desc_t::io_op_desc_t(const bool read_op,
-                           const torch::Tensor& buffer,
-                           const int fd,
-                           const char* filename,
-                           const long long int num_bytes,
-                           const bool validate)
-    : _read_op(read_op),
-      _buffer(buffer),
-      _fd(fd),
-      _filename(filename),
-      _num_bytes(num_bytes),
-      _validate(validate)
-{
-    _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu()
-#if defined(__ENABLE_CANN__)
-                   || torch_npu::utils::is_npu(_buffer)
-#endif
-                       )
-                      ? _buffer.to(torch::kCPU).pin_memory()
-                      : _buffer;
-    _contiguous_buffer = _cpu_buffer.contiguous();
-}
-
-char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
-
-void io_op_desc_t::fini()
-{
-    if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
-    if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
-#if defined(__ENABLE_CANN__)
-    if (_read_op && torch_npu::utils::is_npu(_buffer)) {
-        auto device = at::Device("npu:0");
-        _buffer.copy_(_cpu_buffer.to(device));
-    }
-#endif
-}
-
 deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
     : _tid(tid),
       _aio_config(aio_config),
@@ -79,18 +37,7 @@ void deepspeed_aio_thread_t::run()
         }
 
         if (next_io_op) {
-            const auto base_offset = next_io_op->_num_bytes * _tid;
-
-            std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
-                next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr()));
-
-            if (_aio_config._overlap_events) {
-                do_aio_operation_overlap(
-                    next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
-            } else {
-                do_aio_operation_sequential(
-                    next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
-            }
+            next_io_op->run(_tid, _aio_ctxt, &_aio_config);
 
             {
                 std::lock_guard<std::mutex> lock(_complete_sync._mutex);
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h
index 20799ecbb018..3cb3c5c3731f 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.h
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.h
@@ -10,28 +10,8 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 #include <condition_variable>
 #include <memory>
 #include <queue>
-#include "deepspeed_py_aio.h"
-
-struct io_op_desc_t {
-    const bool _read_op;
-    torch::Tensor _buffer;
-    int _fd;
-    const std::string _filename;
-    const long long int _num_bytes;
-    torch::Tensor _cpu_buffer;
-    torch::Tensor _contiguous_buffer;
-    const bool _validate;
-
-    io_op_desc_t(const bool read_op,
-                 const torch::Tensor& buffer,
-                 const int fd,
-                 const char* filename,
-                 const long long int num_bytes,
-                 const bool validate);
-
-    char* data_ptr() const;
-    void fini();
-};
+#include "deepspeed_cpu_op.h"
+#include "deepspeed_gds_op.h"
 
 struct thread_sync_t {
     std::mutex _mutex;
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
new file mode 100644
index 000000000000..6a1696598ed8
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include "deepspeed_cpu_op.h"
+
+using namespace std;
+
+cpu_op_desc_t::cpu_op_desc_t(const bool read_op,
+                             const torch::Tensor& buffer,
+                             const int fd,
+                             const char* filename,
+                             const long long int file_num_bytes,
+                             const int num_threads,
+                             const bool validate)
+    : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate),
+      _cpu_buffer(buffer)
+{
+    if (_buffer.is_cuda()) {
+        if (_read_op) {
+            auto options = torch::TensorOptions()
+                               .dtype(_buffer.dtype())
+                               .layout(_buffer.layout())
+                               .device(torch::kCPU);
+            _cpu_buffer = torch::empty(_buffer.nbytes(), options).pin_memory();
+        } else {
+            _cpu_buffer = _buffer.to(torch::kCPU).pin_memory();
+        }
+    }
+
+    _contiguous_buffer = _cpu_buffer.contiguous();
+}
+
+char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void cpu_op_desc_t::fini()
+{
+    if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+}
+
+void cpu_op_desc_t::validate()
+{
+    validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes);
+}
+
+void cpu_op_desc_t::run(const int tid,
+                        std::unique_ptr<aio_context>& aio_ctxt,
+                        deepspeed_aio_config_t* aio_config)
+{
+    assert(tid < _num_threads);
+    const auto base_offset = _num_bytes_per_thread * tid;
+
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
+        new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr()));
+
+    if (aio_config->_overlap_events) {
+        do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
+    } else {
+        do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
+    }
+}
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h
new file mode 100644
index 000000000000..d61fe4f3c545
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.h
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include <memory>
+#include <queue>
+#include "deepspeed_aio_op_desc.h"
+
+struct cpu_op_desc_t : io_op_desc_t {
+    torch::Tensor _cpu_buffer;
+
+    cpu_op_desc_t(const bool read_op,
+                  const torch::Tensor& buffer,
+                  const int fd,
+                  const char* filename,
+                  const long long int file_num_bytes,
+                  const int num_threads,
+                  const bool validate);
+
+    void run(const int tid,
+             std::unique_ptr<aio_context>& aio_ctxt,
+             deepspeed_aio_config_t* aio_config);
+
+    char* data_ptr() const;
+
+    void validate();
+
+    void fini();
+};
diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp
new file mode 100644
index 000000000000..077f0be84c8e
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp
@@ -0,0 +1,162 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include "deepspeed_gds_op.h"
+#include <cstdlib>
+#include <set>
+
+using namespace std;
+
+// For when there is more than 1 device
+// static std::set<char*> base_buffer_registry;
+static std::map<const int64_t, std::set<void*>> base_ptr_registry;
+
+void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle)
+{
+    memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t));
+    cf_descr.handle.fd = fd;
+    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+    CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
+    if (status.err != CU_FILE_SUCCESS) {
+        std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
+        close(fd);
+        exit(EXIT_FAILURE);
+    }
+}
+
+gds_op_desc_t::gds_op_desc_t(const bool read_op,
+                             const torch::Tensor& buffer,
+                             const int fd,
+                             const char* filename,
+                             const long long int file_num_bytes,
+                             const int num_threads,
+                             const bool validate)
+    : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate)
+{
+    // assert(_buffer.is_cuda());
+    _contiguous_buffer = _buffer.contiguous();
+
+    const int64_t device = _buffer.get_device();
+
+    char * buf_ptr = (char *)_contiguous_buffer.data_ptr();
+    int64_t last = -1;
+    int64_t ptr_diff;
+    for (const auto& value : base_ptr_registry[device]) {
+        ptr_diff = buf_ptr - (char *)value;
+        if (last == -1 && ptr_diff >= 0) {
+            last = ptr_diff;
+            _base_ptr = value;
+        }
+        else if ( ptr_diff < last && ptr_diff >= 0) {
+            last = ptr_diff;
+            _base_ptr = value;
+        }
+    }
+    if (_contiguous_buffer.data_ptr() < _base_ptr) {
+        std::cerr << "BASE PTR ERROR :" << _base_ptr << " BUF PTR " << _contiguous_buffer.data_ptr() << std::endl;
+        for (const auto& value : base_ptr_registry[device]) {
+            std::cerr << "BASE PTR AVAIL :" << value  << std::endl;
+        }
+        exit(EXIT_FAILURE);
+    }
+    // _base_ptr = _contiguous_buffer.data_ptr();
+
+    check_cudaruntimecall(cudaSetDevice(device));
+
+    _safe_handle_register(fd, _cf_descr, _cf_handle);
+
+}
+
+char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void gds_op_desc_t::fini()
+{
+    //check_cuFileCall(cuFileBufDeregister(_buffer.data_ptr()), "file buffer deregister");
+    cuFileHandleDeregister(_cf_handle);
+}
+
+void gds_op_desc_t::validate()
+{
+
+    check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
+    const auto cpu_buffer = _buffer.to(torch::kCPU);
+    validate_aio_operation(
+        _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes);
+}
+
+void gds_op_desc_t::run(const int tid,
+                        std::unique_ptr<aio_context>& aio_ctxt,
+                        deepspeed_aio_config_t* aio_config)
+{
+    assert(tid < _num_threads);
+    check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
+    int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char *)_base_ptr;
+    const auto file_offset = _num_bytes_per_thread * tid;
+
+    if (_read_op) {
+        auto ret = cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
+        if (ret < 0) { _report_error(ret, errno, buf_offset); }
+    } else {
+        auto ret = cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
+        if (ret < 0) { _report_error(ret, errno, buf_offset); }
+    }
+}
+
+void gds_op_desc_t::_report_error(const ssize_t return_code,
+                                  const int error_num,
+                                  const off_t offset)
+{
+    const auto op_string = _read_op ? "read failed with " : "write failed with ";
+    const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: ";
+    const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code)
+                                                       : cuFileGetErrorString(error_num);
+    std::cerr << op_string << error_string << error_code << " return code = " << return_code
+              << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread
+              << " offset = " << offset << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+int register_buffer(const torch::Tensor& buffer)
+{
+    const int64_t device = buffer.get_device();
+    void * reg_ptr = buffer.data_ptr();
+
+    // std::cout << "REG PTR " <<  reg_ptr << std::endl;
+    // TODO: add checking to make sure pointer isn't already in set
+    const auto it = base_ptr_registry.find(device);
+    if (it == base_ptr_registry.end()) {
+        std::set<void *> new_ptr_set;
+        new_ptr_set.insert(reg_ptr);
+        base_ptr_registry.insert(std::pair<const int64_t, std::set<void *>>(device, new_ptr_set));
+    } else {
+        base_ptr_registry[device].insert(reg_ptr);
+    }
+
+    check_cudaruntimecall(cudaSetDevice(device));
+    CUfileError_t status = cuFileBufRegister(reg_ptr, buffer.nbytes(), 0);
+    if (status.err != CU_FILE_SUCCESS) {
+        std::cerr << "buffer register failed:" << cuFileGetErrorString(status) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+    return 0;
+}
+
+int deregister_buffer(const torch::Tensor& buffer)
+{
+    const int64_t device = buffer.get_device();
+    void * reg_ptr = buffer.data_ptr();
+
+    // std::cout << "DEREG PTR " <<  reg_ptr << std::endl;
+    check_cudaruntimecall(cudaSetDevice(device));
+    cuFileBufDeregister(reg_ptr);
+
+    // Remove from tracked registry
+    base_ptr_registry[device].erase(reg_ptr);
+    return 0;
+}
diff --git a/csrc/aio/py_lib/deepspeed_gds_op.h b/csrc/aio/py_lib/deepspeed_gds_op.h
new file mode 100644
index 000000000000..21f466ecac12
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_gds_op.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include <memory>
+#include <queue>
+
+#include "deepspeed_aio_op_desc.h"
+#include "deepspeed_gds_utils.h"
+
+struct gds_op_desc_t : io_op_desc_t {
+    CUfileDescr_t _cf_descr;
+    CUfileHandle_t _cf_handle;
+    void* _base_ptr;
+
+    gds_op_desc_t(const bool read_op,
+                  const torch::Tensor& buffer,
+                  const int fd,
+                  const char* filename,
+                  const long long int file_num_bytes,
+                  const int num_threads,
+                  const bool validate);
+
+    void run(const int tid,
+             std::unique_ptr<aio_context>& aio_ctxt,
+             deepspeed_aio_config_t* aio_config);
+
+    char* data_ptr() const;
+
+    void validate();
+
+    void fini();
+
+    void _read_file(const int tid);
+
+    void _write_file(const int tid);
+
+    void _report_error(const ssize_t return_code, const int error_num, const off_t offset);
+};
+
+int register_buffer(const torch::Tensor& buffer);
+
+int deregister_buffer(const torch::Tensor& buffer);
diff --git a/csrc/aio/py_lib/deepspeed_gds_utils.h b/csrc/aio/py_lib/deepspeed_gds_utils.h
new file mode 100644
index 000000000000..12b014d90988
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_gds_utils.h
@@ -0,0 +1,91 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include <cstring>
+
+// CUDA/cuFile includes
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "cufile.h"
+
+// Macro for checking cuda errors following a cuda launch or api call
+#define cudaCheckError()                                                                     \
+    {                                                                                        \
+        cudaError_t e = cudaGetLastError();                                                  \
+        if (e != cudaSuccess) {                                                              \
+            printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
+            exit(EXIT_FAILURE);                                                              \
+        }                                                                                    \
+    }
+
+#define check_cudadrivercall(fn)                                                           \
+    do {                                                                                   \
+        CUresult res = fn;                                                                 \
+        if (res != CUDA_SUCCESS) {                                                         \
+            const char* str = nullptr;                                                     \
+            cuGetErrorName(res, &str);                                                     \
+            std::cerr << "cuda driver api call failed " << #fn << " res : " << res << ", " \
+                      << __LINE__ << ":" << str << std::endl;                              \
+            std::cerr << "EXITING program!!!" << std::endl;                                \
+            exit(1);                                                                       \
+        }                                                                                  \
+    } while (0)
+
+#define check_cudaruntimecall(fn)                                                         \
+    do {                                                                                  \
+        cudaError_t res = fn;                                                             \
+        if (res != cudaSuccess) {                                                         \
+            const char* str = cudaGetErrorName(res);                                      \
+            std::cerr << "cuda runtime api call failed " << #fn << __LINE__ << ":" << str \
+                      << std::endl;                                                       \
+            std::cerr << "EXITING program!!!" << std::endl;                               \
+            exit(1);                                                                      \
+        }                                                                                 \
+    } while (0)
+
+#define check_cuFileCall(fn, api_msg)                                                  \
+    do {                                                                               \
+        CUfileError_t status = fn;                                                     \
+        if (status.err != CU_FILE_SUCCESS) {                                           \
+            std::cout << api_msg << " failed with error " << CUFILE_ERRSTR(status.err) \
+                      << std::endl;                                                    \
+            exit(EXIT_FAILURE);                                                        \
+        }                                                                              \
+    } while (0)
+
+//
+// cuda driver error description
+//
+static inline const char* GetCuErrorString(CUresult curesult)
+{
+    const char* descp;
+    if (cuGetErrorName(curesult, &descp) != CUDA_SUCCESS) descp = "unknown cuda error";
+    return descp;
+}
+
+//
+// cuFile APIs return both cuFile specific error codes as well as POSIX error codes
+// for ease, the below template can be used for getting the error description depending
+// on its type.
+
+// POSIX
+template <class T,
+          typename std::enable_if<std::is_integral<T>::value, std::nullptr_t>::type = nullptr>
+std::string cuFileGetErrorString(T status)
+{
+    status = std::abs(status);
+    return IS_CUFILE_ERR(status) ? std::string(CUFILE_ERRSTR(status))
+                                 : std::string(std::strerror(status));
+}
+
+// CUfileError_t
+template <class T,
+          typename std::enable_if<!std::is_integral<T>::value, std::nullptr_t>::type = nullptr>
+std::string cuFileGetErrorString(T status)
+{
+    std::string errStr = cuFileGetErrorString(static_cast<int>(status.err));
+    if (IS_CUDA_ERR(status)) errStr.append(".").append(GetCuErrorString(status.cu_err));
+    return errStr;
+}
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index 387b713f2bfc..30b6682ada72 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -4,9 +4,6 @@
 // DeepSpeed Team
 
 /*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h
index 11d5225de9f1..ba794db5440d 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio.h
@@ -4,10 +4,7 @@
 // DeepSpeed Team
 
 /*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
-Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+Functionality for swapping tensors to/from (NVMe) storage devices.
 */
 
 #include <deepspeed_aio_common.h>
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index c21e92de9449..b4dc0534fd15 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -4,9 +4,6 @@
 // DeepSpeed Team
 
 /*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
@@ -14,16 +11,38 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 using namespace std;
 
+bool deepspeed_aio_handle_t::s_cuFile_init = false;
+
 static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
 
+static std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(const bool read_op,
+                                                               const torch::Tensor& buffer,
+                                                               const int fd,
+                                                               const char* filename,
+                                                               const long long int file_num_bytes,
+                                                               const int num_threads,
+                                                               const bool validate,
+                                                               const bool use_gds)
+{
+    if (buffer.is_cuda() && use_gds) {
+        return std::make_shared<gds_op_desc_t>(
+            read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+    } else {
+        return std::make_shared<cpu_op_desc_t>(
+            read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+    }
+}
+
 deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
                                                const int queue_depth,
                                                const bool single_submit,
                                                const bool overlap_events,
+                                               const bool use_gds,
                                                const int num_threads)
     : _aio_ctxt(new aio_context(block_size, queue_depth)),
       _single_submit(single_submit),
       _overlap_events(overlap_events),
+      _use_gds(use_gds),
       _num_threads(num_threads),
       _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
       _num_pending_ops(0),
@@ -36,6 +55,12 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
     for (auto& ctxt : _thread_contexts) {
         _threads.push_back(std::thread(_start_aio_thread, ctxt));
     }
+
+    if (!deepspeed_aio_handle_t::s_cuFile_init) {
+        cuFileDriverOpen();
+        cudaCheckError();
+        deepspeed_aio_handle_t::s_cuFile_init = true;
+    }
 }
 
 deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
@@ -58,6 +83,8 @@ const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_su
 
 const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
 
+const bool deepspeed_aio_handle_t::get_use_gds() const { return _use_gds; }
+
 const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
 
 int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
@@ -179,16 +206,12 @@ int deepspeed_aio_handle_t::wait()
     while (_num_pending_ops > 0) {
         auto completed_op = _wait_for_aio_work();
 
+        if (completed_op->_validate) { completed_op->validate(); }
+
         completed_op->fini();
 
         close(completed_op->_fd);
 
-        if (completed_op->_validate) {
-            validate_aio_operation(completed_op->_read_op,
-                                   completed_op->_filename.c_str(),
-                                   completed_op->data_ptr(),
-                                   _num_threads * completed_op->_num_bytes);
-        }
         --_num_pending_ops;
         ++num_completed_ops;
     }
@@ -201,7 +224,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
 {
     const auto op_string = read_op ? "Read" : "Write";
     if (num_bytes % get_thread_count()) {
-        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
+        std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
                   << " not divisible by thread count = " << get_thread_count() << std::endl;
         return false;
     }
@@ -233,8 +256,8 @@ int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
     const auto fd = open_file(filename, true);
     if (fd == -1) { return -1; }
 
-    auto scheduled_op = std::make_shared<io_op_desc_t>(
-        true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);
+    auto scheduled_op = _create_io_op_desc(
+        true, buffer, fd, filename, num_file_bytes, _num_threads, validate, _use_gds);
 
     _schedule_aio_work(scheduled_op);
 
@@ -248,6 +271,7 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
                                    const bool validate,
                                    const bool async)
 {
+
     const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
     assert((num_write_bytes % _num_threads) == 0);
 
@@ -256,8 +280,8 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
     const auto fd = open_file(filename, false);
     if (fd == -1) { return -1; }
 
-    auto scheduled_op = std::make_shared<io_op_desc_t>(
-        false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);
+    auto scheduled_op = _create_io_op_desc(
+        false, buffer, fd, filename, num_write_bytes, _num_threads, validate, _use_gds);
 
     _schedule_aio_work(scheduled_op);
 
@@ -296,3 +320,13 @@ bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor
 {
     return _pinned_tensor_mgr->free(locked_tensor);
 }
+
+int deepspeed_aio_handle_t::new_device_locked_tensor(const torch::Tensor& buffer)
+{
+    return register_buffer(buffer);
+}
+
+int deepspeed_aio_handle_t::free_device_locked_tensor(const torch::Tensor& buffer)
+{
+    return deregister_buffer(buffer);
+}
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index 3a254c3814a2..db11a81426b6 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -16,8 +16,10 @@ struct deepspeed_aio_handle_t {
     std::unique_ptr<struct aio_context> _aio_ctxt;
     const bool _single_submit;
     const bool _overlap_events;
+    const bool _use_gds;
     const int _num_threads;
     deepspeed_aio_config_t _aio_config;
+    static bool s_cuFile_init;
 
     std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
     std::vector<std::thread> _threads;
@@ -28,6 +30,7 @@ struct deepspeed_aio_handle_t {
                            const int queue_depth,
                            const bool single_submit,
                            const bool overlap_events,
+                           const bool use_gds,
                            const int num_threads);
 
     ~deepspeed_aio_handle_t();
@@ -36,6 +39,7 @@ struct deepspeed_aio_handle_t {
     const int get_queue_depth() const;
     const bool get_single_submit() const;
     const bool get_overlap_events() const;
+    const bool get_use_gds() const;
     const int get_thread_count() const;
 
     int read(torch::Tensor& buffer, const char* filename, const bool validate);
@@ -65,6 +69,10 @@ struct deepspeed_aio_handle_t {
 
     bool free_cpu_locked_tensor(torch::Tensor&);
 
+    int new_device_locked_tensor(const torch::Tensor& example_tensor);
+
+    int free_device_locked_tensor(const torch::Tensor& example_tensor);
+
     int wait();
 
     void _stop_threads();
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index 8a59107dd347..561c46f7c287 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -4,7 +4,7 @@
 // DeepSpeed Team
 
 /*
-Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+Functionality for swapping tensors to/from (NVMe) storage devices.
 */
 
 #include "deepspeed_py_copy.h"
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h
index 19ba28317d00..f443571a3e7b 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.h
+++ b/csrc/aio/py_lib/deepspeed_py_copy.h
@@ -4,9 +4,6 @@
 // DeepSpeed Team
 
 /*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp
index 9033549bc0d2..8e1736013d6e 100755
--- a/csrc/aio/py_lib/py_ds_aio.cpp
+++ b/csrc/aio/py_lib/py_ds_aio.cpp
@@ -10,6 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 #include <torch/extension.h>
 #include "deepspeed_py_aio_handle.h"
 #include "deepspeed_py_copy.h"
+using namespace pybind11::literals;
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -20,12 +21,20 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");
 
     py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
-        .def(py::init<const int, const int, const bool, const bool, const int>())
+        .def(py::init<const int, const int, const bool, const bool, const bool, const int>(),
+             "AIO handle constructor",
+             "block_size"_a,
+             "queue_depth"_a,
+             "single_submit"_a,
+             "overlap_events"_a,
+             "use_gds"_a,
+             "num_threads"_a)
 
         .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
         .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
         .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
         .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
+        .def("get_use_gds", &deepspeed_aio_handle_t::get_use_gds)
         .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
 
         .def("read", &deepspeed_aio_handle_t::read)
@@ -41,6 +50,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 
         .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
         .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
+        .def("new_device_locked_tensor", &deepspeed_aio_handle_t::new_device_locked_tensor)
+        .def("free_device_locked_tensor", &deepspeed_aio_handle_t::free_device_locked_tensor)
 
         .def("wait", &deepspeed_aio_handle_t::wait);
 }
diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py
index 66d843d68ea2..0e018063b10a 100644
--- a/csrc/aio/py_test/ds_aio_args.py
+++ b/csrc/aio/py_test/ds_aio_args.py
@@ -83,6 +83,11 @@ def validate_args(args):
         no_error = no_error and no_mapping_error
         error_messages += mapping_error_messages
 
+    # Validate --gpu, --use_gds
+    if args.use_gds and not args.gpu:
+        error_messages.append(f'--gpu must be set to transfer with --use_gds')
+        no_error = False
+
     if not no_error:
         print(f'Found {len(error_messages)} validation errors')
         for i, msg in enumerate(error_messages):
@@ -141,6 +146,8 @@ def parse_arguments():
 
     parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
 
+    parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO')
+
     parser.add_argument('--slow_bounce_buffer',
                         action='store_true',
                         help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.')
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 369cb9d4030f..a7600a033002 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -20,6 +20,7 @@
 
 def pre_handle(args, tid, read_op):
     io_string = "Read" if read_op else "Write"
+    gds = True if args.use_gds else False
     device_id, folder = args.mapping_list[tid]
     filename = create_filename(folder, args.read, args.io_size, tid)
     if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
@@ -30,7 +31,7 @@ def pre_handle(args, tid, read_op):
     if args.gpu:
         device_name = get_accelerator().device_name(device_id)
         buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name)
-        if not args.slow_bounce_buffer:
+        if not (args.slow_bounce_buffer or gds):
             bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8,
                                           device='cpu').pin_memory()
     else:
@@ -41,7 +42,9 @@ def pre_handle(args, tid, read_op):
 
     io_parallel = args.io_parallel if args.io_parallel else 1
     handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
-                                                not args.sequential_requests, io_parallel)
+                                                not args.sequential_requests, gds,io_parallel)
+    if gds:
+        handle.new_device_locked_tensor(buffer)
     task_log(tid, f'created deepspeed aio handle')
 
     ctxt = {}
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 56c81f41eb70..2590ad92bd27 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -24,6 +24,7 @@ IO_SIZE=$1
 LOG_DIR=$2/aio_perf_sweep
 MAP_DIR=$2/aio
 GPU_MEM=$3
+USE_GDS=$4
 RUN_SCRIPT=./test_ds_aio.py
 READ_OPT="--read"
 
@@ -38,6 +39,11 @@ if [[ ${GPU_MEM} == "gpu" ]]; then
 else
     gpu_opt=""
 fi
+if [[ ${USE_GDS} == "gds" ]]; then
+    gds_opt="--use_gds"
+else
+    gds_opt=""
+fi
 
 DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sync"
@@ -58,7 +64,7 @@ for sub in single block; do
             for t in 1 2 4 8; do
                 for d in 16 32 64; do
                     for bs in 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}"
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
                         OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
                         LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                         cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh
index d8abc6869c50..544be4e5a0c2 100755
--- a/csrc/aio/py_test/run_write_sweep.sh
+++ b/csrc/aio/py_test/run_write_sweep.sh
@@ -25,15 +25,11 @@ function validate_environment()
 
 validate_environment
 
-if [[ $# -ne 2 ]]; then
-    echo "Usage: $0 <write size in MB> <output log dir><gpu(optional>"
-    exit 1
-fi
-
 IO_SIZE=$1
 LOG_DIR=$2/aio_perf_sweep
 MAP_DIR=$2/aio
 GPU_MEM=$3
+USE_GDS=$4
 RUN_SCRIPT=./test_ds_aio.py
 
 OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt
@@ -49,6 +45,11 @@ if [[ ${GPU_MEM} == "gpu" ]]; then
 else
     gpu_opt=""
 fi
+if [[ ${USE_GDS} == "gds" ]]; then
+    gds_opt="--use_gds"
+else
+    gds_opt=""
+fi
 
 DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sync"
@@ -69,7 +70,7 @@ for sub in single block; do
             for t in 1 2 4 8; do
                 for d in 16 32 64; do
                     for bs in 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} --folder ${MAP_DIR}"
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt}--folder ${MAP_DIR}"
                         OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
                         LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                         cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index b55c821910b9..e998daa2c376 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -3,13 +3,14 @@
 
 # DeepSpeed Team
 
+import os
 import distutils.spawn
 import subprocess
 
-from .builder import OpBuilder
+from .builder import TorchCPUOpBuilder
 
 
-class AsyncIOBuilder(OpBuilder):
+class AsyncIOBuilder(TorchCPUOpBuilder):
     BUILD_VAR = "DS_BUILD_AIO"
     NAME = "async_io"
 
@@ -25,38 +26,45 @@ def sources(self):
             'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
             'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp',
             'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp',
-            'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
+            'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_gds_op.cpp',
+            'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
         ]
 
     def include_paths(self):
-        return ['csrc/aio/py_lib', 'csrc/aio/common']
+        import torch
+        if self.build_for_cpu:
+            CUDA_INCLUDE = []
+        elif not self.is_rocm_pytorch():
+            CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+        else:
+            CUDA_INCLUDE = [
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"),
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
+            ]
+        return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE
 
     def cxx_args(self):
         # -O0 for improved debugging, since performance is bound by I/O
-        CPU_ARCH = self.cpu_arch()
-        SIMD_WIDTH = self.simd_width()
-        import torch  # Keep this import here to avoid errors when building DeepSpeed wheel without torch installed
-        TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2])
-        if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1:
-            CPP_STD = '-std=c++17'
-        else:
-            CPP_STD = '-std=c++14'
-        return [
-            '-g',
+        args = super().cxx_args()
+        args += [
             '-Wall',
             '-O0',
-            CPP_STD,
             '-shared',
             '-fPIC',
             '-Wno-reorder',
-            CPU_ARCH,
-            '-fopenmp',
-            SIMD_WIDTH,
-            '-laio',
         ]
 
+        return args
+
     def extra_ldflags(self):
-        return ['-laio']
+        if self.build_for_cpu:
+            return ['-fopenmp']
+
+        import torch.utils.cpp_extension
+        CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME
+        CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
+        return [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart', '-lcufile']
 
     def check_for_libaio_pkg(self):
         libs = dict(
@@ -85,7 +93,7 @@ def is_compatible(self, verbose=True):
         # which is a function provided by libaio that is used in the async_io op.
         # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS
         # respectively to specify the directories for libaio.h and libaio.so.
-        aio_compatible = self.has_function('io_pgetevents', ('aio', ))
+        aio_compatible = self.has_function('io_submit', ('aio', ))
         if verbose and not aio_compatible:
             self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.")
 

From 084e03e83b246c6b5621cebd5fb1690d340cbc0f Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Thu, 23 May 2024 23:05:29 +0000
Subject: [PATCH 04/31] setting gds block size

---
 csrc/aio/py_lib/deepspeed_gds_op.cpp        | 2 --
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 6 ++++++
 csrc/aio/py_test/ds_aio_handle.py           | 4 +++-
 csrc/aio/py_test/run_read_sweep.sh          | 4 ++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp
index 077f0be84c8e..8b4f8be6e22f 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp
@@ -65,8 +65,6 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op,
         }
         exit(EXIT_FAILURE);
     }
-    // _base_ptr = _contiguous_buffer.data_ptr();
-
     check_cudaruntimecall(cudaSetDevice(device));
 
     _safe_handle_register(fd, _cf_descr, _cf_handle);
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index b4dc0534fd15..f2b90919ef15 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -59,6 +59,12 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
     if (!deepspeed_aio_handle_t::s_cuFile_init) {
         cuFileDriverOpen();
         cudaCheckError();
+        size_t direct_io_size = (size_t)block_size / 1024;
+        CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size);
+        if (status.err != CU_FILE_SUCCESS) {
+            std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
+            exit(EXIT_FAILURE);
+        }
         deepspeed_aio_handle_t::s_cuFile_init = true;
     }
 }
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index a7600a033002..9e55ae6cacf1 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -51,6 +51,7 @@ def pre_handle(args, tid, read_op):
     ctxt['file'] = filename
     ctxt['num_bytes'] = args.io_size
     ctxt['handle'] = handle
+    ctxt['gds'] = gds
     ctxt[BUFFER] = buffer
     ctxt[BOUNCE_BUFFER] = bounce_buffer
     ctxt['elapsed_sec'] = 0
@@ -74,6 +75,8 @@ def post_handle(pool_params):
     _, _, ctxt = pool_params
     for buf in [BUFFER, BOUNCE_BUFFER]:
         if ctxt[buf] is not None:
+            if ctxt['gds']:
+                ctxt['handle'].free_device_locked_tensor(ctxt[buf])
             ctxt[buf].detach()
             ctxt[buf] = None
     return ctxt
@@ -92,7 +95,6 @@ def main_parallel_read(pool_params):
         ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
-
     return ctxt
 
 
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 2590ad92bd27..0f26d718afac 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -62,7 +62,7 @@ for sub in single block; do
         fi
         for p in 1 2 4 8; do
             for t in 1 2 4 8; do
-                for d in 16 32 64; do
+                for d in 32 64 128; do
                     for bs in 256K 512K 1M; do
                         SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
                         OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
@@ -75,7 +75,7 @@ for sub in single block; do
                         eval ${DISABLE_CACHE}
                         eval ${cmd}
                         eval ${SYNC}
-                        sleep 1
+                        sleep 5
                     done
                 done
             done

From cef9af08552edeac8f894de21b1c35c0e3e8e668 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Tue, 28 May 2024 21:24:49 +0000
Subject: [PATCH 05/31] gds working w/threads

---
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 36 ++++++++++++++++-----
 csrc/aio/py_lib/deepspeed_py_aio_handle.h   |  2 +-
 csrc/aio/py_test/ds_aio_args.py             |  2 +-
 csrc/aio/py_test/run_read_sweep.sh          | 27 ++++++++++++----
 4 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index f2b90919ef15..6bd2f6385c7f 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -8,6 +8,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
 #include "deepspeed_py_aio_handle.h"
+#include <fstream>
+#include <string>
+#include <cstdlib>
 
 using namespace std;
 
@@ -48,15 +51,20 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
       _num_pending_ops(0),
       _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
 {
-    for (auto i = 0; i < num_threads; ++i) {
-        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
-    }
-
-    for (auto& ctxt : _thread_contexts) {
-        _threads.push_back(std::thread(_start_aio_thread, ctxt));
-    }
 
-    if (!deepspeed_aio_handle_t::s_cuFile_init) {
+    if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) {
+        std::string depthStr = std::to_string(queue_depth);
+        std::string threadsStr = std::to_string(num_threads);
+        std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", ";
+        std::string json2 = R"("max_request_parallelism": )"+threadsStr+", ";
+        std::string json3 = R"("max_io_threads": )"+threadsStr+", ";
+        std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})";
+        std::ofstream outFile("local_cufile.json");
+        if (outFile.is_open()){
+            outFile << json1 + json2 + json3 + json4;
+            outFile.close();
+        } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);}
+        putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
         cuFileDriverOpen();
         cudaCheckError();
         size_t direct_io_size = (size_t)block_size / 1024;
@@ -66,6 +74,17 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
             exit(EXIT_FAILURE);
         }
         deepspeed_aio_handle_t::s_cuFile_init = true;
+        // GDS threads handled internally
+        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(0, _aio_config));
+        _num_threads = 1;
+    } else { // CPU OP
+        for (auto i = 0; i < num_threads; ++i) {
+            _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
+        }
+    }
+
+    for (auto& ctxt : _thread_contexts) {
+        _threads.push_back(std::thread(_start_aio_thread, ctxt));
     }
 }
 
@@ -73,6 +92,7 @@ deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
 {
     _stop_threads();
     for (auto& thr : _threads) { thr.join(); }
+    if (_use_gds) {cuFileDriverClose();}
 }
 
 const int deepspeed_aio_handle_t::get_block_size() const
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index db11a81426b6..bc3f6818d402 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -17,7 +17,7 @@ struct deepspeed_aio_handle_t {
     const bool _single_submit;
     const bool _overlap_events;
     const bool _use_gds;
-    const int _num_threads;
+    int _num_threads;
     deepspeed_aio_config_t _aio_config;
     static bool s_cuFile_init;
 
diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py
index 0e018063b10a..5fc3098d3357 100644
--- a/csrc/aio/py_test/ds_aio_args.py
+++ b/csrc/aio/py_test/ds_aio_args.py
@@ -140,7 +140,7 @@ def parse_arguments():
 
     parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
 
-    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
+    parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions')
 
     parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
 
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 0f26d718afac..83afe291ec7e 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -17,6 +17,14 @@ function validate_environment()
     fi
 }
 
+function fileExists() {
+    local file="$1"
+    if [[ -f "$file" ]]; then
+        return 0
+    else
+        return 1
+    fi
+}
 
 validate_environment
 
@@ -68,14 +76,19 @@ for sub in single block; do
                         OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
                         LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                         cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
-                        echo ${DISABLE_CACHE}
-                        echo ${cmd}
-                        echo ${SYNC}
 
-                        eval ${DISABLE_CACHE}
-                        eval ${cmd}
-                        eval ${SYNC}
-                        sleep 5
+                        if fileExists ${LOG}; then
+                            echo "Log Exists"
+                            sleep 2
+                        else
+                            echo ${DISABLE_CACHE}
+                            echo ${cmd}
+                            echo ${SYNC}
+                            eval ${DISABLE_CACHE}
+                            eval ${cmd}
+                            eval ${SYNC}
+                            sleep 2
+                        fi
                     done
                 done
             done

From 86594a489f1268c7418bc1a12ea1c933e5988ac9 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Thu, 30 May 2024 19:20:39 +0000
Subject: [PATCH 06/31] keeping in case container delete

---
 csrc/aio/py_test/run_read_sweep.sh  | 66 ++++++++++++++---------------
 csrc/aio/py_test/run_write_sweep.sh |  6 +--
 2 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 83afe291ec7e..ea2bceece148 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -4,6 +4,15 @@ if [[ $# -lt 2 ]]; then
     exit 1
 fi
 
+function prep_folder()
+{
+    folder=$1
+    if [[ -d ${folder} ]]; then
+        rm -f ${folder}/*
+    else
+        mkdir -p ${folder}
+    fi
+}
 
 function validate_environment()
 {
@@ -36,6 +45,9 @@ USE_GDS=$4
 RUN_SCRIPT=./test_ds_aio.py
 READ_OPT="--read"
 
+prep_folder ${MAP_DIR}
+prep_folder ${LOG_DIR}
+
 if [[ -d ${LOG_DIR} ]]; then
     rm -f ${LOG_DIR}/*
 else
@@ -55,43 +67,27 @@ fi
 
 DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sync"
+sub_opt=""
+sub="block"
+ov_opt=""
+ov="overlap"
+t=8
 
-for sub in single block; do
-    if [[ $sub == "single" ]]; then
-        sub_opt="--single_submit"
-    else
-        sub_opt=""
-    fi
-    for ov in overlap sequential; do
-        if [[ $ov == "sequential" ]]; then
-            ov_opt="--sequential_requests"
-        else
-            ov_opt=""
-        fi
-        for p in 1 2 4 8; do
-            for t in 1 2 4 8; do
-                for d in 32 64 128; do
-                    for bs in 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
-                        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
-                        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
-                        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+for p in 1 8; do
+    for d in 64 128; do
+        for bs in 8M 16M; do
+            SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
+            OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
+            LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+            cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
 
-                        if fileExists ${LOG}; then
-                            echo "Log Exists"
-                            sleep 2
-                        else
-                            echo ${DISABLE_CACHE}
-                            echo ${cmd}
-                            echo ${SYNC}
-                            eval ${DISABLE_CACHE}
-                            eval ${cmd}
-                            eval ${SYNC}
-                            sleep 2
-                        fi
-                    done
-                done
-            done
+            echo ${DISABLE_CACHE}
+            echo ${cmd}
+            echo ${SYNC}
+            eval ${DISABLE_CACHE}
+            eval ${cmd}
+            eval ${SYNC}
+            sleep 2
         done
     done
 done
diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh
index 544be4e5a0c2..a54d1c8d7bed 100755
--- a/csrc/aio/py_test/run_write_sweep.sh
+++ b/csrc/aio/py_test/run_write_sweep.sh
@@ -68,9 +68,9 @@ for sub in single block; do
         fi
         for p in 1 2 4 8; do
             for t in 1 2 4 8; do
-                for d in 16 32 64; do
+                for d in 32 64 128; do
                     for bs in 256K 512K 1M; do
-                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt}--folder ${MAP_DIR}"
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
                         OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
                         LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                         cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
@@ -81,7 +81,7 @@ for sub in single block; do
                         eval ${DISABLE_CACHE}
                         eval ${cmd}
                         eval ${SYNC}
-                        sleep 1
+                        sleep 2
                     done
                 done
         done

From d13b1ab87004348e779a0db1d8d5ffb9cd6f89c6 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Thu, 30 May 2024 21:49:26 +0000
Subject: [PATCH 07/31] ftd in read sweep

---
 csrc/aio/py_test/run_read_sweep.sh | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index ea2bceece148..1036e6bdb4aa 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -72,22 +72,21 @@ sub="block"
 ov_opt=""
 ov="overlap"
 t=8
+p=8
 
-for p in 1 8; do
-    for d in 64 128; do
-        for bs in 8M 16M; do
-            SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
-            OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
-            LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
-            cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+for d in 64 128; do
+    for bs in 8M 16M; do
+        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /workspace/nvme03:0 /workspace/nvme03:1 /workspace/nvme03:2 /workspace/nvme03:3 /workspace/nvme47:4 /workspace/nvme47:5 /workspace/nvme47:6 /workspace/nvme47:7"
+        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
+        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
 
-            echo ${DISABLE_CACHE}
-            echo ${cmd}
-            echo ${SYNC}
-            eval ${DISABLE_CACHE}
-            eval ${cmd}
-            eval ${SYNC}
-            sleep 2
-        done
+        echo ${DISABLE_CACHE}
+        echo ${cmd}
+        echo ${SYNC}
+        eval ${DISABLE_CACHE}
+        eval ${cmd}
+        eval ${SYNC}
+        sleep 2
     done
 done

From 4d9c27e864f609e5d2d83f3fbd8dddbedc211563 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Wed, 3 Jul 2024 18:47:28 +0000
Subject: [PATCH 08/31] changes in master to make it run

---
 csrc/aio/py_test/run_read_sweep.sh    | 18 +++++++++---------
 deepspeed/elasticity/elastic_agent.py |  7 +++++--
 requirements/requirements.txt         |  2 +-
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 1036e6bdb4aa..f474791af668 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -16,7 +16,7 @@ function prep_folder()
 
 function validate_environment()
 {
-    validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python ./validate_async_io.py"
+    validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python3 ./validate_async_io.py"
     eval ${validate_cmd}
     res=$?
     if [[ $res != 0 ]]; then
@@ -38,12 +38,12 @@ function fileExists() {
 validate_environment
 
 IO_SIZE=$1
-LOG_DIR=$2/aio_perf_sweep
+LOG_DIR=./1nvme_cpu_write
 MAP_DIR=$2/aio
 GPU_MEM=$3
 USE_GDS=$4
 RUN_SCRIPT=./test_ds_aio.py
-READ_OPT="--read"
+READ_OPT=""
 
 prep_folder ${MAP_DIR}
 prep_folder ${LOG_DIR}
@@ -65,21 +65,21 @@ else
     gds_opt=""
 fi
 
-DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
-SYNC="sync"
+DISABLE_CACHE="sudo sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+SYNC="sudo sync"
 sub_opt=""
 sub="block"
 ov_opt=""
 ov="overlap"
 t=8
-p=8
+p=1
 
 for d in 64 128; do
     for bs in 8M 16M; do
-        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /workspace/nvme03:0 /workspace/nvme03:1 /workspace/nvme03:2 /workspace/nvme03:3 /workspace/nvme47:4 /workspace/nvme47:5 /workspace/nvme47:6 /workspace/nvme47:7"
+        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvmed0:0"
         OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
-        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
-        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+        cmd="/usr/bin/time python3 ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
 
         echo ${DISABLE_CACHE}
         echo ${cmd}
diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py
index c1e8932ecaba..d079ecd516e2 100644
--- a/deepspeed/elasticity/elastic_agent.py
+++ b/deepspeed/elasticity/elastic_agent.py
@@ -6,7 +6,7 @@
 from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent
 from typing import Any, Dict, Optional, Tuple
 from datetime import datetime
-from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port
+from torch.distributed.elastic.utils.distributed import get_free_port
 from torch.distributed.elastic.metrics import put_metric
 from torch.distributed.elastic.agent.server.api import (
     RunResult,
@@ -24,6 +24,9 @@
 from contextlib import closing
 import subprocess
 
+from torch.distributed.elastic.utils.logging import get_logger
+
+log = get_logger(__name__)
 
 class DSElasticAgent(LocalElasticAgent):
 
@@ -44,7 +47,7 @@ def _set_master_addr_port(store: Store,
                               master_port: Optional[int],
                               local_addr: Optional[str] = None):
         if master_port is None:
-            sock = _get_socket_with_port()
+            sock = get_free_port() 
             with closing(sock):
                 master_port = sock.getsockname()[1]
 
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 80c9f9b3287a..9b923d94f619 100755
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,6 +1,6 @@
 hjson
 ninja
-numpy
+numpy<2.0.0
 packaging>=20.0
 psutil
 py-cpuinfo

From 7c94fe8c0e6193b87b634d01d3ac0e6d14a64621 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Mon, 22 Jul 2024 23:09:17 +0000
Subject: [PATCH 09/31] compile without gds

---
 csrc/aio/py_lib/deepspeed_gds_op.cpp        | 63 ++++++++++++++++++-
 csrc/aio/py_lib/deepspeed_gds_op.h          | 16 ++++-
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 27 +-------
 csrc/aio/py_test/run_read_sweep.sh          | 68 ++++++++++++++-------
 4 files changed, 124 insertions(+), 50 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp
index 8b4f8be6e22f..34c7282cd897 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp
@@ -8,11 +8,36 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
 #include "deepspeed_gds_op.h"
-#include <cstdlib>
-#include <set>
 
 using namespace std;
 
+#ifdef __ENABLE_GDS__
+void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads)
+{
+    std::string depthStr = std::to_string(queue_depth);
+    std::string threadsStr = std::to_string(num_threads);
+    std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", ";
+    std::string json2 = R"("max_request_parallelism": )"+threadsStr+", ";
+    std::string json3 = R"("max_io_threads": )"+threadsStr+", ";
+    std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})";
+    std::ofstream outFile("local_cufile.json");
+    if (outFile.is_open()){
+        outFile << json1 + json2 + json3 + json4;
+        outFile.close();
+    } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);}
+    putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
+    cuFileDriverOpen();
+    cudaCheckError();
+    size_t direct_io_size = (size_t)block_size / 1024;
+    CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size);
+    if (status.err != CU_FILE_SUCCESS) {
+        std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+};
+
+void close_gds() {cuFileDriverClose();}
+
 // For when there is more than 1 device
 // static std::set<char*> base_buffer_registry;
 static std::map<const int64_t, std::set<void*>> base_ptr_registry;
@@ -158,3 +183,37 @@ int deregister_buffer(const torch::Tensor& buffer)
     base_ptr_registry[device].erase(reg_ptr);
     return 0;
 }
+#else
+void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads)
+{
+    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
+    exit(EXIT_FAILURE);
+};
+void close_gds()
+{
+    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
+    exit(EXIT_FAILURE);
+};
+gds_op_desc_t::gds_op_desc_t(const bool read_op,
+                             const torch::Tensor& buffer,
+                             const int fd,
+                             const char* filename,
+                             const long long int file_num_bytes,
+                             const int num_threads,
+                             const bool validate)
+    : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate)
+{
+    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
+    exit(EXIT_FAILURE);
+};
+int register_buffer(const torch::Tensor& buffer)
+{
+    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
+    exit(EXIT_FAILURE);
+};
+int deregister_buffer(const torch::Tensor& buffer)
+{
+    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
+    exit(EXIT_FAILURE);
+};
+#endif
diff --git a/csrc/aio/py_lib/deepspeed_gds_op.h b/csrc/aio/py_lib/deepspeed_gds_op.h
index 21f466ecac12..1e955aa67558 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.h
+++ b/csrc/aio/py_lib/deepspeed_gds_op.h
@@ -5,13 +5,21 @@
 
 #include <memory>
 #include <queue>
+#include <fstream>
+#include <string>
+#include <cstdlib>
+#include <set>
 
 #include "deepspeed_aio_op_desc.h"
+#ifdef __ENABLE_GDS__
 #include "deepspeed_gds_utils.h"
+#endif
 
 struct gds_op_desc_t : io_op_desc_t {
-    CUfileDescr_t _cf_descr;
-    CUfileHandle_t _cf_handle;
+    #ifdef __ENABLE_GDS__
+        CUfileDescr_t _cf_descr;
+        CUfileHandle_t _cf_handle;
+    #endif
     void* _base_ptr;
 
     gds_op_desc_t(const bool read_op,
@@ -42,3 +50,7 @@ struct gds_op_desc_t : io_op_desc_t {
 int register_buffer(const torch::Tensor& buffer);
 
 int deregister_buffer(const torch::Tensor& buffer);
+
+void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads);
+
+void close_gds();
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index 6bd2f6385c7f..d968f1f0b25a 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -8,8 +8,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
 #include "deepspeed_py_aio_handle.h"
-#include <fstream>
-#include <string>
 #include <cstdlib>
 
 using namespace std;
@@ -53,29 +51,10 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
 {
 
     if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) {
-        std::string depthStr = std::to_string(queue_depth);
-        std::string threadsStr = std::to_string(num_threads);
-        std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", ";
-        std::string json2 = R"("max_request_parallelism": )"+threadsStr+", ";
-        std::string json3 = R"("max_io_threads": )"+threadsStr+", ";
-        std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})";
-        std::ofstream outFile("local_cufile.json");
-        if (outFile.is_open()){
-            outFile << json1 + json2 + json3 + json4;
-            outFile.close();
-        } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);}
-        putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
-        cuFileDriverOpen();
-        cudaCheckError();
-        size_t direct_io_size = (size_t)block_size / 1024;
-        CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size);
-        if (status.err != CU_FILE_SUCCESS) {
-            std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
-            exit(EXIT_FAILURE);
-        }
+        init_gds_cufile(block_size, queue_depth, num_threads);
         deepspeed_aio_handle_t::s_cuFile_init = true;
-        // GDS threads handled internally
         _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(0, _aio_config));
+        // GDS threads handled in cufile.json
         _num_threads = 1;
     } else { // CPU OP
         for (auto i = 0; i < num_threads; ++i) {
@@ -92,7 +71,7 @@ deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
 {
     _stop_threads();
     for (auto& thr : _threads) { thr.join(); }
-    if (_use_gds) {cuFileDriverClose();}
+    if (_use_gds) {close_gds();}
 }
 
 const int deepspeed_aio_handle_t::get_block_size() const
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index f474791af668..14fa0027e004 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -38,12 +38,12 @@ function fileExists() {
 validate_environment
 
 IO_SIZE=$1
-LOG_DIR=./1nvme_cpu_write
+LOG_DIR=./aio_perf_sweep
 MAP_DIR=$2/aio
 GPU_MEM=$3
 USE_GDS=$4
 RUN_SCRIPT=./test_ds_aio.py
-READ_OPT=""
+READ_OPT="--read"
 
 prep_folder ${MAP_DIR}
 prep_folder ${LOG_DIR}
@@ -67,26 +67,50 @@ fi
 
 DISABLE_CACHE="sudo sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
 SYNC="sudo sync"
-sub_opt=""
-sub="block"
-ov_opt=""
-ov="overlap"
-t=8
-p=1
 
-for d in 64 128; do
-    for bs in 8M 16M; do
-        SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvmed0:0"
-        OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
-        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
-        cmd="/usr/bin/time python3 ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
-
-        echo ${DISABLE_CACHE}
-        echo ${cmd}
-        echo ${SYNC}
-        eval ${DISABLE_CACHE}
-        eval ${cmd}
-        eval ${SYNC}
-        sleep 2
+for xtype in cpu gpu gds; do
+    if [[ $xtype == "cpu" ]]; then
+        gpu_opt=""
+        gds_opt=""
+    elif [[ $xtype == "gpu" ]]; then
+        gpu_opt="--gpu"
+        gds_opt=""
+    else
+        gpu_opt="--gpu"
+        gds_opt="--use_gds"
+    fi
+    for sub in single block; do
+        if [[ $sub == "single" ]]; then
+            sub_opt="--single_submit"
+        else
+            sub_opt=""
+        fi
+        for ov in overlap sequential; do
+            if [[ $ov == "sequential" ]]; then
+                ov_opt="--sequential_requests"
+            else
+                ov_opt=""
+            fi
+            for p in 1 2 4 8; do
+                for t in 1 2 4 8; do
+                    for d in 8 16 32 64 128; do
+                        for bs in 128K 256K 512K 1M 2M 4M 8M 16M; do
+                            SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme03:0"
+                            OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
+                            LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+                            cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+                    
+                            echo ${DISABLE_CACHE}
+                            echo ${cmd}
+                            echo ${SYNC}
+                            eval ${DISABLE_CACHE}
+                            eval ${cmd}
+                            eval ${SYNC}
+                            sleep 2
+                        done
+                    done
+                done
+            done
+        done
     done
 done

From f02ba09a502c4da4823263541edf0965b66869d6 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Tue, 23 Jul 2024 22:07:01 +0000
Subject: [PATCH 10/31] gds macro

---
 op_builder/async_io.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index e998daa2c376..02f8bcc77faa 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -47,16 +47,21 @@ def include_paths(self):
     def cxx_args(self):
         # -O0 for improved debugging, since performance is bound by I/O
         args = super().cxx_args()
+        GDS_ENABLE = self.is_gds_enable()
         args += [
             '-Wall',
             '-O0',
             '-shared',
             '-fPIC',
             '-Wno-reorder',
+            GDS_ENABLE
         ]
 
         return args
 
+    def is_gds_enable():
+        return '-D__ENABLE_GDS__'
+
     def extra_ldflags(self):
         if self.build_for_cpu:
             return ['-fopenmp']

From 7d3ac1002218cd6398cac5876902e4a8b35da9a3 Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Wed, 24 Jul 2024 17:40:20 +0000
Subject: [PATCH 11/31] simple gds+cpu swapper integration

---
 csrc/aio/py_lib/deepspeed_gds_op.cpp          |  1 +
 .../swap_tensor/partitioned_param_swapper.py  | 26 +++++++++++++------
 op_builder/async_io.py                        |  2 +-
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp
index 34c7282cd897..06eb4f78c399 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp
@@ -36,6 +36,7 @@ void init_gds_cufile(const int block_size, const int queue_depth, const int num_
     }
 };
 
+// TODO: deregister and release any held onto buffers
 void close_gds() {cuFileDriverClose();}
 
 // For when there is more than 1 device
diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
index fcc6a272883f..6f09a687d98c 100644
--- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
+++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
@@ -39,6 +39,7 @@ def __init__(self, ds_config, model_dtype):
 
         aio_op = AsyncIOBuilder().load(verbose=False)
         self.aio_handle = aio_op.aio_handle
+        self.use_gds = True
         self.dtype = model_dtype
 
         #set swap buffers, create aio handles
@@ -104,19 +105,28 @@ def _configure_aio(self, ds_config):
 
         self.available_buffer_ids = [i for i in range(self.param_buffer_count)]
         self.reserved_buffer_ids = []
-        self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer *
-                                                                    self.param_buffer_count),
-                                                                dtype=self.dtype,
-                                                                requires_grad=False),
-                                                    align_bytes=0)
 
         self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH],
-                                               self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS],
-                                               self.aio_config[AIO_THREAD_COUNT])
+                                               self.aio_config[AIO_SINGLE_SUBMIT], 
+                                               self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT])
 
         self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH],
                                                 self.aio_config[AIO_SINGLE_SUBMIT],
-                                                self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT])
+                                                self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT])
+
+        if self.use_gds:
+            self.buffers = torch.empty(int(self.aligned_elements_per_buffer *
+                                           self.param_buffer_count),
+                                       dtype=self.dtype,
+                                       device='cuda', # gotta be cuda
+                                       requires_grad=False)
+            self.aio_read_handle.new_device_locked_tensor(self.buffers)
+        else:
+            self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer *
+                                                                        self.param_buffer_count),
+                                                                    dtype=self.dtype,
+                                                                    requires_grad=False),
+                                                        align_bytes=0)
 
         self.swap_out_params = []
 
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index 02f8bcc77faa..a8620387d209 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -59,7 +59,7 @@ def cxx_args(self):
 
         return args
 
-    def is_gds_enable():
+    def is_gds_enable(self):
         return '-D__ENABLE_GDS__'
 
     def extra_ldflags(self):

From 67da243d53f9872eba9eee504075534f543ed31b Mon Sep 17 00:00:00 2001
From: Joe Mayer <jomayeri@microsoft.com>
Date: Thu, 25 Jul 2024 19:35:25 +0000
Subject: [PATCH 12/31] working pytest

---
 csrc/aio/py_lib/deepspeed_gds_op.cpp        |  2 +-
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp |  4 +-
 tests/unit/ops/aio/test_aio.py              | 70 ++++++++++++---------
 3 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/aio/py_lib/deepspeed_gds_op.cpp
index 06eb4f78c399..207477ef455a 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_gds_op.cpp
@@ -36,7 +36,7 @@ void init_gds_cufile(const int block_size, const int queue_depth, const int num_
     }
 };
 
-// TODO: deregister and release any held onto buffers
+// TODO: deregister and release all buffers
 void close_gds() {cuFileDriverClose();}
 
 // For when there is more than 1 device
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index d968f1f0b25a..f13cd6dd06db 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -53,8 +53,10 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
     if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) {
         init_gds_cufile(block_size, queue_depth, num_threads);
         deepspeed_aio_handle_t::s_cuFile_init = true;
-        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(0, _aio_config));
+    }
+    if (use_gds) {
         // GDS threads handled in cufile.json
+        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(0, _aio_config));
         _num_threads = 1;
     } else { // CPU OP
         for (auto i = 0; i < num_threads; ++i) {
diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py
index f6d175ce67bc..eb6ddd4da8cb 100644
--- a/tests/unit/ops/aio/test_aio.py
+++ b/tests/unit/ops/aio/test_aio.py
@@ -13,22 +13,26 @@
 from deepspeed.ops.op_builder import AsyncIOBuilder
 from unit.common import DistributedTest
 
-KILO_BYTE = 1024
+KILO_BYTE = 1024*256
 BLOCK_SIZE = KILO_BYTE
 QUEUE_DEPTH = 2
 IO_SIZE = 4 * BLOCK_SIZE
 IO_PARALLEL = 2
+GDS_ENABLE=True
 
 if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]:
     pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True)
 
 
-def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True):
+def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True, use_gds=False):
     if not get_accelerator().is_available():
         if use_cuda_device:
             pytest.skip("GPU tensors only supported in CUDA environments.")
         if use_cuda_pinned_tensor:
             pytest.skip("CUDA-pinned tensors only supported in CUDA environments.")
+    if not GDS_ENABLE and use_gds:
+        pytest.skip("GDS not available, won't run GDS case.")
+
 
 
 def _get_local_rank():
@@ -58,7 +62,6 @@ def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0):
     return test_file, test_buffer
 
 
-def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0):
     test_file = _get_test_write_file(tmpdir, index)
     if aio_handle is None:
         test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer)))
@@ -70,17 +73,19 @@ def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, ind
     return test_file, test_buffer
 
 
-def _validate_handle_state(handle, single_submit, overlap_events):
+def _validate_handle_state(handle, single_submit, overlap_events, use_gds):
     assert handle.get_single_submit() == single_submit
     assert handle.get_overlap_events() == overlap_events
-    assert handle.get_thread_count() == IO_PARALLEL
+    if use_gds:
+        assert handle.get_thread_count() == 1
+    else:
+        assert handle.get_thread_count() == IO_PARALLEL
     assert handle.get_block_size() == BLOCK_SIZE
     assert handle.get_queue_depth() == QUEUE_DEPTH
 
-
-@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
-@pytest.mark.parametrize("single_submit", [True, False])
-@pytest.mark.parametrize("overlap_events", [True, False])
+@pytest.mark.parametrize("single_submit", [True,False])
+@pytest.mark.parametrize("overlap_events", [True,False])
+@pytest.mark.parametrize("use_cuda_pinned_tensor, use_gds", [(False,False),(True,False),(False,True)])
 class TestRead(DistributedTest):
     world_size = 1
     reuse_dist_env = True
@@ -89,17 +94,20 @@ class TestRead(DistributedTest):
         init_distributed = False
         set_dist_env = False
 
-    def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events):
-        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
+    def test_parallel_read(self, tmpdir, single_submit, overlap_events, use_cuda_pinned_tensor, use_gds):
+        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
 
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
 
         if use_cuda_pinned_tensor:
             aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu'))
+        elif use_gds:
+            aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
+            h.new_device_locked_tensor(aio_buffer)
         else:
             aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8))
 
-        _validate_handle_state(h, single_submit, overlap_events)
+        _validate_handle_state(h, single_submit, overlap_events, use_gds)
 
         ref_file, _ = _do_ref_write(tmpdir)
         read_status = h.sync_pread(aio_buffer, ref_file)
@@ -109,15 +117,17 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, over
             ref_buffer = list(f.read())
         assert ref_buffer == aio_buffer.tolist()
 
-        if not use_cuda_pinned_tensor:
+        if use_gds:
+            h.free_device_locked_tensor(aio_buffer)
+        elif not use_cuda_pinned_tensor:
             h.free_cpu_locked_tensor(aio_buffer)
 
     @pytest.mark.parametrize("cuda_device", [True, False])
-    def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
+    def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
 
         use_cpu_locked_tensor = False
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
 
         if cuda_device:
             aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
@@ -147,6 +157,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap
 @pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
 @pytest.mark.parametrize("single_submit", [True, False])
 @pytest.mark.parametrize("overlap_events", [True, False])
+@pytest.mark.parametrize("use_gds", [False])
 class TestWrite(DistributedTest):
     world_size = 1
     reuse_dist_env = True
@@ -155,11 +166,11 @@ class TestWrite(DistributedTest):
         init_distributed = False
         set_dist_env = False
 
-    def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events):
-        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
+    def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds):
+        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
 
         ref_file, ref_buffer = _do_ref_write(tmpdir)
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
 
         if use_cuda_pinned_tensor:
             aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer)
@@ -180,12 +191,12 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove
         assert filecmp.cmp(ref_file, aio_file, shallow=False)
 
     @pytest.mark.parametrize("cuda_device", [True, False])
-    def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
+    def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
 
         ref_file, ref_buffer = _do_ref_write(tmpdir)
 
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
         use_cpu_locked_tensor = False
         if cuda_device:
             aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer)
@@ -215,6 +226,7 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla
 @pytest.mark.sequential
 @pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
 @pytest.mark.parametrize("cuda_device", [True, False])
+@pytest.mark.parametrize("use_gds", [False])
 class TestAsyncQueue(DistributedTest):
     world_size = 1
     requires_cuda_env = False
@@ -223,8 +235,8 @@ class TestAsyncQueue(DistributedTest):
         set_dist_env = False
 
     @pytest.mark.parametrize("async_queue", [2, 3])
-    def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
+    def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, use_gds):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
 
         ref_files = []
         for i in range(async_queue):
@@ -233,7 +245,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device):
 
         single_submit = True
         overlap_events = True
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
 
         use_cpu_locked_tensor = False
         if cuda_device:
@@ -270,8 +282,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device):
                 h.free_cpu_locked_tensor(t)
 
     @pytest.mark.parametrize("async_queue", [2, 3])
-    def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
+    def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device, use_gds):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
 
         ref_files = []
         ref_buffers = []
@@ -282,7 +294,7 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device):
 
         single_submit = True
         overlap_events = True
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, useg_gds,IO_PARALLEL)
 
         aio_files = []
         aio_buffers = []

From 048729db9ec6a820afdc4005686453d281d4b716 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Mon, 5 Aug 2024 18:27:42 -0400
Subject: [PATCH 13/31] Make GDS first class operator (#640)

* Simplify GDS integration

* Make GDS first class op

* GDS op cleanup

* gds_handle correctness

* Fix unit tests

* Fix gds bug in param_swapper

---------

Co-authored-by: Joe Mayer <jomayeri@microsoft.com>
---
 csrc/aio/py_lib/deepspeed_aio_thread.h        |   1 -
 csrc/aio/py_lib/deepspeed_cpu_op.cpp          |   9 +-
 csrc/aio/py_lib/deepspeed_cpu_op.h            |   1 +
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp   |  72 ++----
 csrc/aio/py_lib/deepspeed_py_aio_handle.h     |  17 +-
 csrc/aio/py_lib/deepspeed_py_copy.cpp         |   2 +-
 csrc/aio/py_lib/py_ds_aio.cpp                 |   6 +-
 csrc/aio/py_test/ds_aio_handle.py             |   2 +-
 csrc/aio/py_test/run_read_sweep.sh            |   2 +-
 ...cated_tile_access_iterator_residual_last.h |   8 +-
 csrc/{aio => gds}/py_lib/deepspeed_gds_op.cpp | 201 +++++----------
 csrc/{aio => gds}/py_lib/deepspeed_gds_op.h   |  28 +-
 .../{aio => gds}/py_lib/deepspeed_gds_utils.h |   0
 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp   |  95 +++++++
 csrc/gds/py_lib/deepspeed_py_gds_handle.h     |  40 +++
 csrc/gds/py_lib/py_ds_gds.cpp                 |  48 ++++
 csrc/gds/py_test/validate_gds.py              |  10 +
 csrc/includes/simd.h                          |   2 +-
 csrc/xpu/includes/simd.h                      |   2 +-
 csrc/xpu/includes/type_shim.h                 |  10 +-
 deepspeed/elasticity/elastic_agent.py         |   1 +
 deepspeed/ops/gds/__init__.py                 |   6 +
 deepspeed/runtime/swap_tensor/aio_config.py   |  14 +-
 deepspeed/runtime/swap_tensor/constants.py    |   5 +-
 .../swap_tensor/partitioned_param_swapper.py  |  19 +-
 op_builder/async_io.py                        |  30 +--
 op_builder/builder.py                         |   5 +-
 op_builder/gds.py                             |  50 ++++
 tests/unit/ops/aio/test_aio.py                |  74 +++---
 tests/unit/ops/aio/test_gds.py                | 244 ++++++++++++++++++
 30 files changed, 690 insertions(+), 314 deletions(-)
 rename csrc/{aio => gds}/py_lib/deepspeed_gds_op.cpp (54%)
 rename csrc/{aio => gds}/py_lib/deepspeed_gds_op.h (68%)
 rename csrc/{aio => gds}/py_lib/deepspeed_gds_utils.h (100%)
 create mode 100644 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
 create mode 100644 csrc/gds/py_lib/deepspeed_py_gds_handle.h
 create mode 100644 csrc/gds/py_lib/py_ds_gds.cpp
 create mode 100644 csrc/gds/py_test/validate_gds.py
 mode change 100755 => 100644 csrc/xpu/includes/simd.h
 create mode 100755 deepspeed/ops/gds/__init__.py
 create mode 100644 op_builder/gds.py
 create mode 100644 tests/unit/ops/aio/test_gds.py

diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h
index 3cb3c5c3731f..a192804db13d 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.h
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.h
@@ -11,7 +11,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 #include <memory>
 #include <queue>
 #include "deepspeed_cpu_op.h"
-#include "deepspeed_gds_op.h"
 
 struct thread_sync_t {
     std::mutex _mutex;
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
index 6a1696598ed8..767ad5d905e0 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
@@ -17,7 +17,9 @@ cpu_op_desc_t::cpu_op_desc_t(const bool read_op,
     : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate),
       _cpu_buffer(buffer)
 {
-    if (_buffer.is_cuda()) {
+    // Need to use CPU bounce buffer if buffer is not a page-locked DRAM memory.
+    _use_bounce_buffer = !(_buffer.is_cpu() && _buffer.is_pinned());
+    if (_use_bounce_buffer) {
         if (_read_op) {
             auto options = torch::TensorOptions()
                                .dtype(_buffer.dtype())
@@ -28,7 +30,6 @@ cpu_op_desc_t::cpu_op_desc_t(const bool read_op,
             _cpu_buffer = _buffer.to(torch::kCPU).pin_memory();
         }
     }
-
     _contiguous_buffer = _cpu_buffer.contiguous();
 }
 
@@ -36,7 +37,9 @@ char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_pt
 
 void cpu_op_desc_t::fini()
 {
-    if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+    if (_read_op) {
+        if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+    }
 }
 
 void cpu_op_desc_t::validate()
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h
index d61fe4f3c545..07a4369674fc 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.h
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.h
@@ -9,6 +9,7 @@
 
 struct cpu_op_desc_t : io_op_desc_t {
     torch::Tensor _cpu_buffer;
+    bool _use_bounce_buffer;
 
     cpu_op_desc_t(const bool read_op,
                   const torch::Tensor& buffer,
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index f13cd6dd06db..a6a68ee1a1d0 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -12,56 +12,23 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 using namespace std;
 
-bool deepspeed_aio_handle_t::s_cuFile_init = false;
-
 static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
 
-static std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(const bool read_op,
-                                                               const torch::Tensor& buffer,
-                                                               const int fd,
-                                                               const char* filename,
-                                                               const long long int file_num_bytes,
-                                                               const int num_threads,
-                                                               const bool validate,
-                                                               const bool use_gds)
-{
-    if (buffer.is_cuda() && use_gds) {
-        return std::make_shared<gds_op_desc_t>(
-            read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
-    } else {
-        return std::make_shared<cpu_op_desc_t>(
-            read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
-    }
-}
-
 deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
                                                const int queue_depth,
                                                const bool single_submit,
                                                const bool overlap_events,
-                                               const bool use_gds,
                                                const int num_threads)
     : _aio_ctxt(new aio_context(block_size, queue_depth)),
       _single_submit(single_submit),
       _overlap_events(overlap_events),
-      _use_gds(use_gds),
       _num_threads(num_threads),
       _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
       _num_pending_ops(0),
       _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
 {
-
-    if (!deepspeed_aio_handle_t::s_cuFile_init && use_gds) {
-        init_gds_cufile(block_size, queue_depth, num_threads);
-        deepspeed_aio_handle_t::s_cuFile_init = true;
-    }
-    if (use_gds) {
-        // GDS threads handled in cufile.json
-        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(0, _aio_config));
-        _num_threads = 1;
-    } else { // CPU OP
-        for (auto i = 0; i < num_threads; ++i) {
-            _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
-        }
+    for (auto i = 0; i < num_threads; ++i) {
+        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
     }
 
     for (auto& ctxt : _thread_contexts) {
@@ -73,7 +40,6 @@ deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
 {
     _stop_threads();
     for (auto& thr : _threads) { thr.join(); }
-    if (_use_gds) {close_gds();}
 }
 
 const int deepspeed_aio_handle_t::get_block_size() const
@@ -90,8 +56,6 @@ const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_su
 
 const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
 
-const bool deepspeed_aio_handle_t::get_use_gds() const { return _use_gds; }
-
 const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
 
 int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
@@ -239,6 +203,19 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
     return true;
 }
 
+std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_create_io_op_desc(
+    const bool read_op,
+    const torch::Tensor& buffer,
+    const int fd,
+    const char* filename,
+    const long long int file_num_bytes,
+    const int num_threads,
+    const bool validate)
+{
+    return std::make_shared<cpu_op_desc_t>(
+        read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+}
+
 int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
                                   const char* filename,
                                   const bool validate,
@@ -263,8 +240,8 @@ int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
     const auto fd = open_file(filename, true);
     if (fd == -1) { return -1; }
 
-    auto scheduled_op = _create_io_op_desc(
-        true, buffer, fd, filename, num_file_bytes, _num_threads, validate, _use_gds);
+    auto scheduled_op =
+        _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, _num_threads, validate);
 
     _schedule_aio_work(scheduled_op);
 
@@ -278,7 +255,6 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
                                    const bool validate,
                                    const bool async)
 {
-
     const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
     assert((num_write_bytes % _num_threads) == 0);
 
@@ -287,8 +263,8 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
     const auto fd = open_file(filename, false);
     if (fd == -1) { return -1; }
 
-    auto scheduled_op = _create_io_op_desc(
-        false, buffer, fd, filename, num_write_bytes, _num_threads, validate, _use_gds);
+    auto scheduled_op =
+        _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, _num_threads, validate);
 
     _schedule_aio_work(scheduled_op);
 
@@ -327,13 +303,3 @@ bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor
 {
     return _pinned_tensor_mgr->free(locked_tensor);
 }
-
-int deepspeed_aio_handle_t::new_device_locked_tensor(const torch::Tensor& buffer)
-{
-    return register_buffer(buffer);
-}
-
-int deepspeed_aio_handle_t::free_device_locked_tensor(const torch::Tensor& buffer)
-{
-    return deregister_buffer(buffer);
-}
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index bc3f6818d402..180d9aba2f9c 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -16,10 +16,8 @@ struct deepspeed_aio_handle_t {
     std::unique_ptr<struct aio_context> _aio_ctxt;
     const bool _single_submit;
     const bool _overlap_events;
-    const bool _use_gds;
     int _num_threads;
     deepspeed_aio_config_t _aio_config;
-    static bool s_cuFile_init;
 
     std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
     std::vector<std::thread> _threads;
@@ -30,7 +28,6 @@ struct deepspeed_aio_handle_t {
                            const int queue_depth,
                            const bool single_submit,
                            const bool overlap_events,
-                           const bool use_gds,
                            const int num_threads);
 
     ~deepspeed_aio_handle_t();
@@ -39,7 +36,6 @@ struct deepspeed_aio_handle_t {
     const int get_queue_depth() const;
     const bool get_single_submit() const;
     const bool get_overlap_events() const;
-    const bool get_use_gds() const;
     const int get_thread_count() const;
 
     int read(torch::Tensor& buffer, const char* filename, const bool validate);
@@ -69,10 +65,6 @@ struct deepspeed_aio_handle_t {
 
     bool free_cpu_locked_tensor(torch::Tensor&);
 
-    int new_device_locked_tensor(const torch::Tensor& example_tensor);
-
-    int free_device_locked_tensor(const torch::Tensor& example_tensor);
-
     int wait();
 
     void _stop_threads();
@@ -82,4 +74,13 @@ struct deepspeed_aio_handle_t {
     std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
 
     bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
+
+    virtual std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(
+        const bool read_op,
+        const torch::Tensor& buffer,
+        const int fd,
+        const char* filename,
+        const long long int file_num_bytes,
+        const int num_threads,
+        const bool validate);
 };
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index 561c46f7c287..f5480e9d9d83 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices.
 #include "deepspeed_py_copy.h"
 #include <omp.h>
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__) or defined(__AVX256__)
 union AVX_Data {
diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp
index 8e1736013d6e..78e91c1607b1 100755
--- a/csrc/aio/py_lib/py_ds_aio.cpp
+++ b/csrc/aio/py_lib/py_ds_aio.cpp
@@ -21,20 +21,18 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");
 
     py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
-        .def(py::init<const int, const int, const bool, const bool, const bool, const int>(),
+        .def(py::init<const int, const int, const bool, const bool, const int>(),
              "AIO handle constructor",
              "block_size"_a,
              "queue_depth"_a,
              "single_submit"_a,
              "overlap_events"_a,
-             "use_gds"_a,
              "num_threads"_a)
 
         .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
         .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
         .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
         .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
-        .def("get_use_gds", &deepspeed_aio_handle_t::get_use_gds)
         .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
 
         .def("read", &deepspeed_aio_handle_t::read)
@@ -50,8 +48,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 
         .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
         .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
-        .def("new_device_locked_tensor", &deepspeed_aio_handle_t::new_device_locked_tensor)
-        .def("free_device_locked_tensor", &deepspeed_aio_handle_t::free_device_locked_tensor)
 
         .def("wait", &deepspeed_aio_handle_t::wait);
 }
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 9e55ae6cacf1..969afe39cee2 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -42,7 +42,7 @@ def pre_handle(args, tid, read_op):
 
     io_parallel = args.io_parallel if args.io_parallel else 1
     handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
-                                                not args.sequential_requests, gds,io_parallel)
+                                                not args.sequential_requests, gds, io_parallel)
     if gds:
         handle.new_device_locked_tensor(buffer)
     task_log(tid, f'created deepspeed aio handle')
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 14fa0027e004..7c2cb46f83fc 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -99,7 +99,7 @@ for xtype in cpu gpu gds; do
                             OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
                             LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                             cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
-                    
+
                             echo ${DISABLE_CACHE}
                             echo ${cmd}
                             echo ${SYNC}
diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
index 7f6a2430845a..dcbdc11c27ad 100644
--- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
@@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -695,7 +695,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -1211,7 +1211,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {};
     };
 
 private:
@@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {};
     };
 
 private:
diff --git a/csrc/aio/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp
similarity index 54%
rename from csrc/aio/py_lib/deepspeed_gds_op.cpp
rename to csrc/gds/py_lib/deepspeed_gds_op.cpp
index 207477ef455a..b8af24f09070 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp
@@ -11,39 +11,11 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 using namespace std;
 
-#ifdef __ENABLE_GDS__
-void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads)
-{
-    std::string depthStr = std::to_string(queue_depth);
-    std::string threadsStr = std::to_string(num_threads);
-    std::string json1 = R"({"execution": {"max_io_queue_depth": )"+depthStr+", ";
-    std::string json2 = R"("max_request_parallelism": )"+threadsStr+", ";
-    std::string json3 = R"("max_io_threads": )"+threadsStr+", ";
-    std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})";
-    std::ofstream outFile("local_cufile.json");
-    if (outFile.is_open()){
-        outFile << json1 + json2 + json3 + json4;
-        outFile.close();
-    } else { std::cerr<<"Can't open local cufile" << std::endl;exit(EXIT_FAILURE);}
-    putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
-    cuFileDriverOpen();
-    cudaCheckError();
-    size_t direct_io_size = (size_t)block_size / 1024;
-    CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size);
-    if (status.err != CU_FILE_SUCCESS) {
-        std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
-        exit(EXIT_FAILURE);
-    }
-};
-
-// TODO: deregister and release all buffers
-void close_gds() {cuFileDriverClose();}
-
 // For when there is more than 1 device
 // static std::set<char*> base_buffer_registry;
 static std::map<const int64_t, std::set<void*>> base_ptr_registry;
 
-void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle)
+static void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle)
 {
     memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t));
     cf_descr.handle.fd = fd;
@@ -56,108 +28,44 @@ void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t
     }
 }
 
-gds_op_desc_t::gds_op_desc_t(const bool read_op,
-                             const torch::Tensor& buffer,
-                             const int fd,
-                             const char* filename,
-                             const long long int file_num_bytes,
-                             const int num_threads,
-                             const bool validate)
-    : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate)
+static void* _find_base_ptr(const int64_t device, char* buf_ptr)
 {
-    // assert(_buffer.is_cuda());
-    _contiguous_buffer = _buffer.contiguous();
-
-    const int64_t device = _buffer.get_device();
-
-    char * buf_ptr = (char *)_contiguous_buffer.data_ptr();
+    void* base_ptr = nullptr;
     int64_t last = -1;
     int64_t ptr_diff;
     for (const auto& value : base_ptr_registry[device]) {
-        ptr_diff = buf_ptr - (char *)value;
+        ptr_diff = buf_ptr - (char*)value;
         if (last == -1 && ptr_diff >= 0) {
             last = ptr_diff;
-            _base_ptr = value;
-        }
-        else if ( ptr_diff < last && ptr_diff >= 0) {
+            base_ptr = value;
+        } else if (ptr_diff < last && ptr_diff >= 0) {
             last = ptr_diff;
-            _base_ptr = value;
+            base_ptr = value;
         }
     }
-    if (_contiguous_buffer.data_ptr() < _base_ptr) {
-        std::cerr << "BASE PTR ERROR :" << _base_ptr << " BUF PTR " << _contiguous_buffer.data_ptr() << std::endl;
+    if (!base_ptr || buf_ptr < base_ptr) {
+        std::cerr << "BASE PTR ERROR :" << base_ptr << " BUF PTR " << (void*)buf_ptr << std::endl;
         for (const auto& value : base_ptr_registry[device]) {
-            std::cerr << "BASE PTR AVAIL :" << value  << std::endl;
+            std::cerr << "BASE PTR AVAIL :" << value << std::endl;
         }
         exit(EXIT_FAILURE);
     }
-    check_cudaruntimecall(cudaSetDevice(device));
-
-    _safe_handle_register(fd, _cf_descr, _cf_handle);
-
-}
-
-char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
-
-void gds_op_desc_t::fini()
-{
-    //check_cuFileCall(cuFileBufDeregister(_buffer.data_ptr()), "file buffer deregister");
-    cuFileHandleDeregister(_cf_handle);
-}
-
-void gds_op_desc_t::validate()
-{
-
-    check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
-    const auto cpu_buffer = _buffer.to(torch::kCPU);
-    validate_aio_operation(
-        _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes);
-}
-
-void gds_op_desc_t::run(const int tid,
-                        std::unique_ptr<aio_context>& aio_ctxt,
-                        deepspeed_aio_config_t* aio_config)
-{
-    assert(tid < _num_threads);
-    check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
-    int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char *)_base_ptr;
-    const auto file_offset = _num_bytes_per_thread * tid;
-
-    if (_read_op) {
-        auto ret = cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
-        if (ret < 0) { _report_error(ret, errno, buf_offset); }
-    } else {
-        auto ret = cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
-        if (ret < 0) { _report_error(ret, errno, buf_offset); }
-    }
-}
 
-void gds_op_desc_t::_report_error(const ssize_t return_code,
-                                  const int error_num,
-                                  const off_t offset)
-{
-    const auto op_string = _read_op ? "read failed with " : "write failed with ";
-    const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: ";
-    const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code)
-                                                       : cuFileGetErrorString(error_num);
-    std::cerr << op_string << error_string << error_code << " return code = " << return_code
-              << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread
-              << " offset = " << offset << std::endl;
-    exit(EXIT_FAILURE);
+    return base_ptr;
 }
 
-int register_buffer(const torch::Tensor& buffer)
+void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer)
 {
     const int64_t device = buffer.get_device();
-    void * reg_ptr = buffer.data_ptr();
+    void* reg_ptr = buffer.data_ptr();
 
     // std::cout << "REG PTR " <<  reg_ptr << std::endl;
     // TODO: add checking to make sure pointer isn't already in set
     const auto it = base_ptr_registry.find(device);
     if (it == base_ptr_registry.end()) {
-        std::set<void *> new_ptr_set;
+        std::set<void*> new_ptr_set;
         new_ptr_set.insert(reg_ptr);
-        base_ptr_registry.insert(std::pair<const int64_t, std::set<void *>>(device, new_ptr_set));
+        base_ptr_registry.insert(std::pair<const int64_t, std::set<void*>>(device, new_ptr_set));
     } else {
         base_ptr_registry[device].insert(reg_ptr);
     }
@@ -168,13 +76,12 @@ int register_buffer(const torch::Tensor& buffer)
         std::cerr << "buffer register failed:" << cuFileGetErrorString(status) << std::endl;
         exit(EXIT_FAILURE);
     }
-    return 0;
 }
 
-int deregister_buffer(const torch::Tensor& buffer)
+void gds_op_desc_t::remove_buffer_from_registry(const torch::Tensor& buffer)
 {
     const int64_t device = buffer.get_device();
-    void * reg_ptr = buffer.data_ptr();
+    void* reg_ptr = buffer.data_ptr();
 
     // std::cout << "DEREG PTR " <<  reg_ptr << std::endl;
     check_cudaruntimecall(cudaSetDevice(device));
@@ -182,19 +89,8 @@ int deregister_buffer(const torch::Tensor& buffer)
 
     // Remove from tracked registry
     base_ptr_registry[device].erase(reg_ptr);
-    return 0;
 }
-#else
-void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads)
-{
-    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
-    exit(EXIT_FAILURE);
-};
-void close_gds()
-{
-    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
-    exit(EXIT_FAILURE);
-};
+
 gds_op_desc_t::gds_op_desc_t(const bool read_op,
                              const torch::Tensor& buffer,
                              const int fd,
@@ -204,17 +100,56 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op,
                              const bool validate)
     : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate)
 {
-    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
-    exit(EXIT_FAILURE);
-};
-int register_buffer(const torch::Tensor& buffer)
+    _contiguous_buffer = _buffer.contiguous();
+    const int64_t device = _buffer.get_device();
+    check_cudaruntimecall(cudaSetDevice(device));
+    _base_ptr = _find_base_ptr(device, (char*)_contiguous_buffer.data_ptr());
+
+    _safe_handle_register(fd, _cf_descr, _cf_handle);
+}
+
+char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void gds_op_desc_t::fini() { cuFileHandleDeregister(_cf_handle); }
+
+void gds_op_desc_t::validate()
 {
-    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
-    exit(EXIT_FAILURE);
-};
-int deregister_buffer(const torch::Tensor& buffer)
+    check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
+    const auto cpu_buffer = _buffer.to(torch::kCPU);
+    validate_aio_operation(
+        _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes);
+}
+
+void gds_op_desc_t::run(const int tid,
+                        std::unique_ptr<aio_context>& aio_ctxt,
+                        deepspeed_aio_config_t* aio_config)
 {
-    std::cerr << "Library compiled without __ENABLE_GDS__"  << std::endl;
+    assert(tid < _num_threads);
+    check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
+    int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr;
+    const auto file_offset = _num_bytes_per_thread * tid;
+
+    if (_read_op) {
+        auto ret =
+            cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
+        if (ret < 0) { _report_error(ret, errno, buf_offset); }
+    } else {
+        auto ret =
+            cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
+        if (ret < 0) { _report_error(ret, errno, buf_offset); }
+    }
+}
+
+void gds_op_desc_t::_report_error(const ssize_t return_code,
+                                  const int error_num,
+                                  const off_t offset)
+{
+    const auto op_string = _read_op ? "read failed with " : "write failed with ";
+    const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: ";
+    const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code)
+                                                       : cuFileGetErrorString(error_num);
+    std::cerr << op_string << error_string << error_code << " return code = " << return_code
+              << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread
+              << " offset = " << offset << std::endl;
     exit(EXIT_FAILURE);
-};
-#endif
+}
diff --git a/csrc/aio/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h
similarity index 68%
rename from csrc/aio/py_lib/deepspeed_gds_op.h
rename to csrc/gds/py_lib/deepspeed_gds_op.h
index 1e955aa67558..3ad8b9ecf58d 100644
--- a/csrc/aio/py_lib/deepspeed_gds_op.h
+++ b/csrc/gds/py_lib/deepspeed_gds_op.h
@@ -3,23 +3,19 @@
 
 // DeepSpeed Team
 
+#include <cstdlib>
+#include <fstream>
 #include <memory>
 #include <queue>
-#include <fstream>
-#include <string>
-#include <cstdlib>
 #include <set>
+#include <string>
 
 #include "deepspeed_aio_op_desc.h"
-#ifdef __ENABLE_GDS__
 #include "deepspeed_gds_utils.h"
-#endif
 
 struct gds_op_desc_t : io_op_desc_t {
-    #ifdef __ENABLE_GDS__
-        CUfileDescr_t _cf_descr;
-        CUfileHandle_t _cf_handle;
-    #endif
+    CUfileDescr_t _cf_descr;
+    CUfileHandle_t _cf_handle;
     void* _base_ptr;
 
     gds_op_desc_t(const bool read_op,
@@ -40,17 +36,9 @@ struct gds_op_desc_t : io_op_desc_t {
 
     void fini();
 
-    void _read_file(const int tid);
-
-    void _write_file(const int tid);
-
     void _report_error(const ssize_t return_code, const int error_num, const off_t offset);
-};
 
-int register_buffer(const torch::Tensor& buffer);
+    static void add_buffer_to_registry(const torch::Tensor& buffer);
 
-int deregister_buffer(const torch::Tensor& buffer);
-
-void init_gds_cufile(const int block_size, const int queue_depth, const int num_threads);
-
-void close_gds();
+    static void remove_buffer_from_registry(const torch::Tensor& buffer);
+};
diff --git a/csrc/aio/py_lib/deepspeed_gds_utils.h b/csrc/gds/py_lib/deepspeed_gds_utils.h
similarity index 100%
rename from csrc/aio/py_lib/deepspeed_gds_utils.h
rename to csrc/gds/py_lib/deepspeed_gds_utils.h
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
new file mode 100644
index 000000000000..859ca19535a4
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -0,0 +1,95 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+    GPUDirect Storage functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include "deepspeed_py_gds_handle.h"
+#include <cstdlib>
+#include "deepspeed_gds_op.h"
+
+using namespace std;
+
+int deepspeed_gds_handle_t::s_cuFile_init = 0;
+
+deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
+                                               const int queue_depth,
+                                               const bool single_submit,
+                                               const bool overlap_events,
+                                               const int num_threads)
+    : deepspeed_aio_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+{
+    _init_cuFile(block_size, queue_depth, num_threads);
+}
+
+deepspeed_gds_handle_t::~deepspeed_gds_handle_t() { _close_cuFile(); }
+
+void deepspeed_gds_handle_t::_init_cuFile(const int block_size,
+                                          const int queue_depth,
+                                          const int num_threads)
+{
+    if (deepspeed_gds_handle_t::s_cuFile_init == 0) {
+        std::string depthStr = std::to_string(queue_depth);
+        std::string threadsStr = std::to_string(num_threads);
+        std::string json1 = R"({"execution": {"max_io_queue_depth": )" + depthStr + ", ";
+        std::string json2 = R"("max_request_parallelism": )" + threadsStr + ", ";
+        std::string json3 = R"("max_io_threads": )" + threadsStr + ", ";
+        std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})";
+        std::ofstream outFile("local_cufile.json");
+        if (outFile.is_open()) {
+            outFile << json1 + json2 + json3 + json4;
+            outFile.close();
+        } else {
+            std::cerr << "Can't open local cufile" << std::endl;
+            exit(EXIT_FAILURE);
+        }
+        putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
+        cuFileDriverOpen();
+        cudaCheckError();
+        size_t direct_io_size = (size_t)block_size / 1024;
+        CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size);
+        if (status.err != CU_FILE_SUCCESS) {
+            std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
+            exit(EXIT_FAILURE);
+        }
+    }
+    deepspeed_gds_handle_t::s_cuFile_init++;
+}
+
+void deepspeed_gds_handle_t::_close_cuFile()
+{
+    deepspeed_gds_handle_t::s_cuFile_init--;
+    if (deepspeed_gds_handle_t::s_cuFile_init == 0) { cuFileDriverClose(); }
+}
+
+int deepspeed_gds_handle_t::new_device_locked_tensor(const torch::Tensor& buffer)
+{
+    gds_op_desc_t::add_buffer_to_registry(buffer);
+    return 0;
+}
+
+int deepspeed_gds_handle_t::free_device_locked_tensor(const torch::Tensor& buffer)
+{
+    gds_op_desc_t::remove_buffer_from_registry(buffer);
+    return 0;
+}
+
+std::shared_ptr<struct io_op_desc_t> deepspeed_gds_handle_t::_create_io_op_desc(
+    const bool read_op,
+    const torch::Tensor& buffer,
+    const int fd,
+    const char* filename,
+    const long long int file_num_bytes,
+    const int num_threads,
+    const bool validate)
+{
+    if (buffer.is_cuda()) {
+        return std::make_shared<gds_op_desc_t>(
+            read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+    }
+    return deepspeed_aio_handle_t::_create_io_op_desc(
+        read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+}
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
new file mode 100644
index 000000000000..0e42b07a49dc
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include <condition_variable>
+#include <memory>
+#include "deepspeed_py_aio_handle.h"
+
+struct deepspeed_gds_handle_t : deepspeed_aio_handle_t {
+    deepspeed_gds_handle_t(const int block_size,
+                           const int queue_depth,
+                           const bool single_submit,
+                           const bool overlap_events,
+                           const int num_threads);
+
+    ~deepspeed_gds_handle_t();
+
+    int new_device_locked_tensor(const torch::Tensor& buffer);
+
+    int free_device_locked_tensor(const torch::Tensor& buffer);
+
+    void _init_cuFile(const int block_size, const int queue_length, const int num_threads);
+
+    void _close_cuFile();
+
+    std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(const bool read_op,
+                                                            const torch::Tensor& buffer,
+                                                            const int fd,
+                                                            const char* filename,
+                                                            const long long int file_num_bytes,
+                                                            const int num_threads,
+                                                            const bool validate);
+
+    static int s_cuFile_init;
+};
diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp
new file mode 100644
index 000000000000..7d1c34e3bcad
--- /dev/null
+++ b/csrc/gds/py_lib/py_ds_gds.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include <torch/extension.h>
+#include "deepspeed_py_gds_handle.h"
+using namespace pybind11::literals;
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    py::class_<deepspeed_gds_handle_t>(m, "gds_handle")
+        .def(py::init<const int, const int, const bool, const bool, const int>(),
+             "GDS handle constructor",
+             "block_size"_a,
+             "queue_depth"_a,
+             "single_submit"_a,
+             "overlap_events"_a,
+             "num_threads"_a)
+
+        .def("get_block_size", &deepspeed_gds_handle_t::get_block_size)
+        .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth)
+        .def("get_single_submit", &deepspeed_gds_handle_t::get_single_submit)
+        .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events)
+        .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count)
+
+        .def("read", &deepspeed_gds_handle_t::read)
+        .def("write", &deepspeed_gds_handle_t::write)
+
+        .def("pread", &deepspeed_gds_handle_t::pread)
+        .def("pwrite", &deepspeed_gds_handle_t::pwrite)
+
+        .def("sync_pread", &deepspeed_gds_handle_t::sync_pread)
+        .def("sync_pwrite", &deepspeed_gds_handle_t::sync_pwrite)
+        .def("async_pread", &deepspeed_gds_handle_t::async_pread)
+        .def("async_pwrite", &deepspeed_gds_handle_t::async_pwrite)
+
+        .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor)
+        .def("free_cpu_locked_tensor", &deepspeed_gds_handle_t::free_cpu_locked_tensor)
+        .def("new_device_locked_tensor", &deepspeed_gds_handle_t::new_device_locked_tensor)
+        .def("free_device_locked_tensor", &deepspeed_gds_handle_t::free_device_locked_tensor)
+
+        .def("wait", &deepspeed_gds_handle_t::wait);
+}
diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py
new file mode 100644
index 000000000000..b34b1194f582
--- /dev/null
+++ b/csrc/gds/py_test/validate_gds.py
@@ -0,0 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+"""
+from deepspeed.ops.op_builder import GDSBuilder
+assert GDSBuilder().is_compatible(True)
+assert GDSBuilder().load(True)
diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h
index f5bfb45dd2e2..a205026ec7c1 100644
--- a/csrc/includes/simd.h
+++ b/csrc/includes/simd.h
@@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val)
     std::memcpy(dst, &val, sizeof(T));
 }
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h
old mode 100755
new mode 100644
index f77568be7835..097e2d8585cc
--- a/csrc/xpu/includes/simd.h
+++ b/csrc/xpu/includes/simd.h
@@ -13,7 +13,7 @@
 #define TILE (128 * 1024 * 1024)
 #if defined(__AVX512__) or defined(__AVX256__)
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h
index fa41757c895b..1897afd1fea2 100644
--- a/csrc/xpu/includes/type_shim.h
+++ b/csrc/xpu/includes/type_shim.h
@@ -82,11 +82,11 @@
     }
 
 template <typename T>
-__inline__ __attribute__((always_inline)) T reduce_block_into_lanes(
-    T* x,
-    T val,
-    int lanes = 1,
-    bool share_result = false)  // lanes is intended to be <= 32.
+__inline__ __attribute__((always_inline)) T
+reduce_block_into_lanes(T* x,
+                        T val,
+                        int lanes = 1,
+                        bool share_result = false)  // lanes is intended to be <= 32.
 {
     auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>();
     int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2);
diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py
index 53b0c8072923..039b999dfeca 100644
--- a/deepspeed/elasticity/elastic_agent.py
+++ b/deepspeed/elasticity/elastic_agent.py
@@ -28,6 +28,7 @@
 
 log = get_logger(__name__)
 
+
 class DSElasticAgent(LocalElasticAgent):
 
     def __init__(
diff --git a/deepspeed/ops/gds/__init__.py b/deepspeed/ops/gds/__init__.py
new file mode 100755
index 000000000000..3c0762c81076
--- /dev/null
+++ b/deepspeed/ops/gds/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from ..op_builder import GDSBuilder
diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py
index df4a38380089..46c3f2a0c954 100644
--- a/deepspeed/runtime/swap_tensor/aio_config.py
+++ b/deepspeed/runtime/swap_tensor/aio_config.py
@@ -5,25 +5,33 @@
 
 from deepspeed.runtime.config_utils import get_scalar_param
 from deepspeed.runtime.swap_tensor.constants import *
+from deepspeed.accelerator import get_accelerator
 
 AIO_DEFAULT_DICT = {
     AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT,
     AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT,
     AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT,
     AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT,
-    AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT
+    AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT,
+    AIO_USE_GDS: AIO_USE_GDS_DEFAULT
 }
 
 
 def get_aio_config(param_dict):
     if AIO in param_dict.keys() and param_dict[AIO] is not None:
         aio_dict = param_dict[AIO]
-        return {
+        aio_config = {
             AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT),
             AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT),
             AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT),
             AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT),
-            AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT)
+            AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT),
+            AIO_USE_GDS: get_scalar_param(aio_dict, AIO_USE_GDS, AIO_USE_GDS_DEFAULT)
         }
 
+        if aio_config[AIO_USE_GDS]:
+            assert get_accelerator().device_name() == 'cuda', 'GDS currently only supported for CUDA accelerator'
+
+        return aio_config
+
     return AIO_DEFAULT_DICT
diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py
index 4c9722bc4e4f..cee20ac7b78c 100644
--- a/deepspeed/runtime/swap_tensor/constants.py
+++ b/deepspeed/runtime/swap_tensor/constants.py
@@ -11,7 +11,8 @@
   "queue_depth": 8,
   "thread_count": 1,
   "single_submit": false,
-  "overlap_events": true
+  "overlap_events": true,
+  "use_gds": false
 }
 '''
 AIO = "aio"
@@ -25,3 +26,5 @@
 AIO_SINGLE_SUBMIT_DEFAULT = False
 AIO_OVERLAP_EVENTS = "overlap_events"
 AIO_OVERLAP_EVENTS_DEFAULT = True
+AIO_USE_GDS = "use_gds"
+AIO_USE_GDS_DEFAULT = False
diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
index 6f09a687d98c..120723fae5ab 100644
--- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
+++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
@@ -13,6 +13,7 @@
 from deepspeed import comm as dist
 from deepspeed.accelerator import get_accelerator
 from deepspeed.ops.op_builder import AsyncIOBuilder
+from deepspeed.ops.op_builder import GDSBuilder
 from .constants import *
 from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, AIO_ALIGNED_BYTES, print_object, SwapBufferPool
 
@@ -37,9 +38,6 @@ class AsyncPartitionedParameterSwapper(object):
 
     def __init__(self, ds_config, model_dtype):
 
-        aio_op = AsyncIOBuilder().load(verbose=False)
-        self.aio_handle = aio_op.aio_handle
-        self.use_gds = True
         self.dtype = model_dtype
 
         #set swap buffers, create aio handles
@@ -94,6 +92,10 @@ def _configure_aio(self, ds_config):
 
         self.aio_config = ds_config.aio_config
 
+        self.use_gds = self.aio_config[AIO_USE_GDS]
+        self.aio_handle = GDSBuilder().load(verbose=False).gds_handle if self.use_gds else AsyncIOBuilder().load(
+            verbose=False).aio_handle
+
         # Read/Write alignment for each thread during Intra-request parallelism
         self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE])
         self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_THREAD_COUNT]
@@ -107,18 +109,17 @@ def _configure_aio(self, ds_config):
         self.reserved_buffer_ids = []
 
         self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH],
-                                               self.aio_config[AIO_SINGLE_SUBMIT], 
-                                               self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT])
+                                               self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS],
+                                               self.aio_config[AIO_THREAD_COUNT])
 
         self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH],
                                                 self.aio_config[AIO_SINGLE_SUBMIT],
-                                                self.aio_config[AIO_OVERLAP_EVENTS], self.use_gds, self.aio_config[AIO_THREAD_COUNT])
+                                                self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT])
 
         if self.use_gds:
-            self.buffers = torch.empty(int(self.aligned_elements_per_buffer *
-                                           self.param_buffer_count),
+            self.buffers = torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count),
                                        dtype=self.dtype,
-                                       device='cuda', # gotta be cuda
+                                       device=get_accelerator().device_name(),
                                        requires_grad=False)
             self.aio_read_handle.new_device_locked_tensor(self.buffers)
         else:
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index a8620387d209..a9039def3a40 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -20,15 +20,18 @@ def __init__(self):
     def absolute_name(self):
         return f'deepspeed.ops.aio.{self.NAME}_op'
 
-    def sources(self):
-        return [
-            'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp',
+    def lib_sources(self):
+        src_list = [
             'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
             'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp',
             'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp',
-            'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_gds_op.cpp',
-            'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
+            'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp',
+            'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
         ]
+        return src_list
+
+    def sources(self):
+        return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp']
 
     def include_paths(self):
         import torch
@@ -47,21 +50,9 @@ def include_paths(self):
     def cxx_args(self):
         # -O0 for improved debugging, since performance is bound by I/O
         args = super().cxx_args()
-        GDS_ENABLE = self.is_gds_enable()
-        args += [
-            '-Wall',
-            '-O0',
-            '-shared',
-            '-fPIC',
-            '-Wno-reorder',
-            GDS_ENABLE
-        ]
-
+        args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder']
         return args
 
-    def is_gds_enable(self):
-        return '-D__ENABLE_GDS__'
-
     def extra_ldflags(self):
         if self.build_for_cpu:
             return ['-fopenmp']
@@ -69,7 +60,8 @@ def extra_ldflags(self):
         import torch.utils.cpp_extension
         CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME
         CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
-        return [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart', '-lcufile']
+        ldflags = [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart']
+        return ldflags
 
     def check_for_libaio_pkg(self):
         libs = dict(
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 03611bf56284..cc87e962c853 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -284,7 +284,7 @@ def is_compatible(self, verbose=True):
     def extra_ldflags(self):
         return []
 
-    def has_function(self, funcname, libraries, verbose=False):
+    def has_function(self, funcname, libraries, library_dirs=None, verbose=False):
         '''
         Test for existence of a function within a tuple of libraries.
 
@@ -340,7 +340,8 @@ def has_function(self, funcname, libraries, verbose=False):
             compiler.link_executable(objs,
                                      os.path.join(tempdir, 'a.out'),
                                      extra_preargs=self.strip_empty_entries(ldflags),
-                                     libraries=libraries)
+                                     libraries=libraries,
+                                     library_dirs=library_dirs)
 
             # Compile and link succeeded
             return True
diff --git a/op_builder/gds.py b/op_builder/gds.py
new file mode 100644
index 000000000000..3b06ca16f40d
--- /dev/null
+++ b/op_builder/gds.py
@@ -0,0 +1,50 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import os
+from .async_io import AsyncIOBuilder
+
+
+class GDSBuilder(AsyncIOBuilder):
+    BUILD_VAR = "DS_BUILD_GDS"
+    NAME = "gds"
+
+    def __init__(self):
+        super().__init__()
+
+    def absolute_name(self):
+        return f'deepspeed.ops.gds.{self.NAME}_op'
+
+    def lib_sources(self):
+        src_list = ['csrc/gds/py_lib/deepspeed_py_gds_handle.cpp', 'csrc/gds/py_lib/deepspeed_gds_op.cpp']
+        return super().lib_sources() + src_list
+
+    def sources(self):
+        return self.lib_sources() + ['csrc/gds/py_lib/py_ds_gds.cpp']
+
+    def cxx_args(self):
+        return super().cxx_args() + ['-lcufile']
+
+    def include_paths(self):
+        import torch
+        CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+        return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE
+
+    def extra_ldflags(self):
+        return super().extra_ldflags() + ['-lcufile']
+
+    def is_compatible(self, verbose=True):
+        import torch.utils.cpp_extension
+        CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME
+        CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
+        gds_compatible = self.has_function(funcname="cuFileDriverOpen",
+                                           libraries=("cufile", ),
+                                           library_dirs=(
+                                               CUDA_HOME,
+                                               CUDA_LIB64,
+                                           ),
+                                           verbose=verbose)
+
+        return gds_compatible and super().is_compatible(verbose)
diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py
index eb6ddd4da8cb..e6927efc3824 100644
--- a/tests/unit/ops/aio/test_aio.py
+++ b/tests/unit/ops/aio/test_aio.py
@@ -13,26 +13,22 @@
 from deepspeed.ops.op_builder import AsyncIOBuilder
 from unit.common import DistributedTest
 
-KILO_BYTE = 1024*256
+KILO_BYTE = 1024
 BLOCK_SIZE = KILO_BYTE
 QUEUE_DEPTH = 2
 IO_SIZE = 4 * BLOCK_SIZE
 IO_PARALLEL = 2
-GDS_ENABLE=True
 
 if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]:
     pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True)
 
 
-def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True, use_gds=False):
+def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True):
     if not get_accelerator().is_available():
         if use_cuda_device:
             pytest.skip("GPU tensors only supported in CUDA environments.")
         if use_cuda_pinned_tensor:
             pytest.skip("CUDA-pinned tensors only supported in CUDA environments.")
-    if not GDS_ENABLE and use_gds:
-        pytest.skip("GDS not available, won't run GDS case.")
-
 
 
 def _get_local_rank():
@@ -62,6 +58,7 @@ def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0):
     return test_file, test_buffer
 
 
+def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0):
     test_file = _get_test_write_file(tmpdir, index)
     if aio_handle is None:
         test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer)))
@@ -73,19 +70,17 @@ def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0):
     return test_file, test_buffer
 
 
-def _validate_handle_state(handle, single_submit, overlap_events, use_gds):
+def _validate_handle_state(handle, single_submit, overlap_events):
     assert handle.get_single_submit() == single_submit
     assert handle.get_overlap_events() == overlap_events
-    if use_gds:
-        assert handle.get_thread_count() == 1
-    else:
-        assert handle.get_thread_count() == IO_PARALLEL
+    assert handle.get_thread_count() == IO_PARALLEL
     assert handle.get_block_size() == BLOCK_SIZE
     assert handle.get_queue_depth() == QUEUE_DEPTH
 
-@pytest.mark.parametrize("single_submit", [True,False])
-@pytest.mark.parametrize("overlap_events", [True,False])
-@pytest.mark.parametrize("use_cuda_pinned_tensor, use_gds", [(False,False),(True,False),(False,True)])
+
+@pytest.mark.parametrize("use_cuda_pinned_tensor", [True])  # TODO: aio_handle pinned tensor API is broken
+@pytest.mark.parametrize("single_submit", [True, False])
+@pytest.mark.parametrize("overlap_events", [True, False])
 class TestRead(DistributedTest):
     world_size = 1
     reuse_dist_env = True
@@ -94,20 +89,17 @@ class TestRead(DistributedTest):
         init_distributed = False
         set_dist_env = False
 
-    def test_parallel_read(self, tmpdir, single_submit, overlap_events, use_cuda_pinned_tensor, use_gds):
-        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
+    def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events):
+        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
 
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         if use_cuda_pinned_tensor:
             aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu'))
-        elif use_gds:
-            aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
-            h.new_device_locked_tensor(aio_buffer)
         else:
             aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8))
 
-        _validate_handle_state(h, single_submit, overlap_events, use_gds)
+        _validate_handle_state(h, single_submit, overlap_events)
 
         ref_file, _ = _do_ref_write(tmpdir)
         read_status = h.sync_pread(aio_buffer, ref_file)
@@ -117,17 +109,15 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events, use_cuda_pin
             ref_buffer = list(f.read())
         assert ref_buffer == aio_buffer.tolist()
 
-        if use_gds:
-            h.free_device_locked_tensor(aio_buffer)
-        elif not use_cuda_pinned_tensor:
+        if not use_cuda_pinned_tensor:
             h.free_cpu_locked_tensor(aio_buffer)
 
     @pytest.mark.parametrize("cuda_device", [True, False])
-    def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
+    def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
 
         use_cpu_locked_tensor = False
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         if cuda_device:
             aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
@@ -154,10 +144,9 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap
             h.free_cpu_locked_tensor(aio_buffer)
 
 
-@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
+@pytest.mark.parametrize("use_cuda_pinned_tensor", [True])  # TODO: aio_handle pinned tensor API is broken
 @pytest.mark.parametrize("single_submit", [True, False])
 @pytest.mark.parametrize("overlap_events", [True, False])
-@pytest.mark.parametrize("use_gds", [False])
 class TestWrite(DistributedTest):
     world_size = 1
     reuse_dist_env = True
@@ -166,11 +155,11 @@ class TestWrite(DistributedTest):
         init_distributed = False
         set_dist_env = False
 
-    def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds):
-        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
+    def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events):
+        _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
 
         ref_file, ref_buffer = _do_ref_write(tmpdir)
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         if use_cuda_pinned_tensor:
             aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer)
@@ -191,12 +180,12 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove
         assert filecmp.cmp(ref_file, aio_file, shallow=False)
 
     @pytest.mark.parametrize("cuda_device", [True, False])
-    def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_gds, cuda_device):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
+    def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
 
         ref_file, ref_buffer = _do_ref_write(tmpdir)
 
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
         use_cpu_locked_tensor = False
         if cuda_device:
             aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer)
@@ -224,9 +213,8 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla
 
 
 @pytest.mark.sequential
-@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
+@pytest.mark.parametrize("use_cuda_pinned_tensor", [True])  # TODO: aio_handle pinned tensor API is broken
 @pytest.mark.parametrize("cuda_device", [True, False])
-@pytest.mark.parametrize("use_gds", [False])
 class TestAsyncQueue(DistributedTest):
     world_size = 1
     requires_cuda_env = False
@@ -235,8 +223,8 @@ class TestAsyncQueue(DistributedTest):
         set_dist_env = False
 
     @pytest.mark.parametrize("async_queue", [2, 3])
-    def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, use_gds):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
+    def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
 
         ref_files = []
         for i in range(async_queue):
@@ -245,7 +233,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, us
 
         single_submit = True
         overlap_events = True
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, use_gds, IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         use_cpu_locked_tensor = False
         if cuda_device:
@@ -282,8 +270,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device, us
                 h.free_cpu_locked_tensor(t)
 
     @pytest.mark.parametrize("async_queue", [2, 3])
-    def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device, use_gds):
-        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor, use_gds=use_gds)
+    def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device):
+        _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor)
 
         ref_files = []
         ref_buffers = []
@@ -294,7 +282,7 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device, u
 
         single_submit = True
         overlap_events = True
-        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, useg_gds,IO_PARALLEL)
+        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         aio_files = []
         aio_buffers = []
diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py
new file mode 100644
index 000000000000..7afa5970d69f
--- /dev/null
+++ b/tests/unit/ops/aio/test_gds.py
@@ -0,0 +1,244 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import os
+import filecmp
+import torch
+import deepspeed
+import deepspeed.comm as dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import GDSBuilder
+from unit.common import DistributedTest
+
+KILO_BYTE = 1024 * 256
+BLOCK_SIZE = KILO_BYTE
+QUEUE_DEPTH = 2
+IO_SIZE = 4 * BLOCK_SIZE
+IO_PARALLEL = 2
+
+if not deepspeed.ops.__compatible_ops__[GDSBuilder.NAME]:
+    pytest.skip('Skip tests since gds is not compatible', allow_module_level=True)
+
+
+def _get_local_rank():
+    if get_accelerator().is_available():
+        return dist.get_rank()
+    return 0
+
+
+def _do_ref_write(tmpdir, index=0):
+    file_suffix = f'{_get_local_rank()}_{index}'
+    ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt')
+    ref_buffer = os.urandom(IO_SIZE)
+    with open(ref_file, 'wb') as f:
+        f.write(ref_buffer)
+
+    return ref_file, ref_buffer
+
+
+def _get_test_write_file(tmpdir, index):
+    file_suffix = f'{_get_local_rank()}_{index}'
+    return os.path.join(tmpdir, f'_gds_write_random_{file_suffix}.pt')
+
+
+def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index=0):
+    test_file = _get_test_write_file(tmpdir, index)
+    test_buffer = get_accelerator().ByteTensor(list(ref_buffer))
+    gds_handle.new_device_locked_tensor(test_buffer)
+    return test_file, test_buffer
+
+
+def _validate_handle_state(handle, single_submit, overlap_events):
+    assert handle.get_single_submit() == single_submit
+    assert handle.get_overlap_events() == overlap_events
+    assert handle.get_thread_count() == IO_PARALLEL
+    assert handle.get_block_size() == BLOCK_SIZE
+    assert handle.get_queue_depth() == QUEUE_DEPTH
+
+
+@pytest.mark.parametrize("single_submit", [True, False])
+@pytest.mark.parametrize("overlap_events", [True, False])
+class TestRead(DistributedTest):
+    world_size = 1
+    reuse_dist_env = True
+    requires_cuda_env = False
+    if not get_accelerator().is_available():
+        init_distributed = False
+        set_dist_env = False
+
+    def test_parallel_read(self, tmpdir, single_submit, overlap_events):
+
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+        gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
+        h.new_device_locked_tensor(gds_buffer)
+
+        _validate_handle_state(h, single_submit, overlap_events)
+
+        ref_file, _ = _do_ref_write(tmpdir)
+        read_status = h.sync_pread(gds_buffer, ref_file)
+        assert read_status == 1
+
+        with open(ref_file, 'rb') as f:
+            ref_buffer = list(f.read())
+        assert ref_buffer == gds_buffer.tolist()
+
+        h.free_device_locked_tensor(gds_buffer)
+
+    def test_async_read(self, tmpdir, single_submit, overlap_events):
+
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+        gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
+        h.new_device_locked_tensor(gds_buffer)
+
+        _validate_handle_state(h, single_submit, overlap_events)
+
+        ref_file, _ = _do_ref_write(tmpdir)
+        read_status = h.async_pread(gds_buffer, ref_file)
+        assert read_status == 0
+
+        wait_status = h.wait()
+        assert wait_status == 1
+
+        with open(ref_file, 'rb') as f:
+            ref_buffer = list(f.read())
+        assert ref_buffer == gds_buffer.tolist()
+
+        h.free_device_locked_tensor(gds_buffer)
+
+
+@pytest.mark.parametrize("single_submit", [True, False])
+@pytest.mark.parametrize("overlap_events", [True, False])
+class TestWrite(DistributedTest):
+    world_size = 1
+    reuse_dist_env = True
+    requires_cuda_env = False
+    if not get_accelerator().is_available():
+        init_distributed = False
+        set_dist_env = False
+
+    def test_parallel_write(self, tmpdir, single_submit, overlap_events):
+
+        ref_file, ref_buffer = _do_ref_write(tmpdir)
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+        gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h)
+
+        _validate_handle_state(h, single_submit, overlap_events)
+
+        write_status = h.sync_pwrite(gds_buffer, gds_file)
+        assert write_status == 1
+
+        h.free_device_locked_tensor(gds_buffer)
+
+        assert os.path.isfile(gds_file)
+
+        filecmp.clear_cache()
+        assert filecmp.cmp(ref_file, gds_file, shallow=False)
+
+    def test_async_write(self, tmpdir, single_submit, overlap_events):
+        ref_file, ref_buffer = _do_ref_write(tmpdir)
+
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+        gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h)
+
+        _validate_handle_state(h, single_submit, overlap_events)
+
+        write_status = h.async_pwrite(gds_buffer, gds_file)
+        assert write_status == 0
+
+        wait_status = h.wait()
+        assert wait_status == 1
+
+        h.free_device_locked_tensor(gds_buffer)
+
+        assert os.path.isfile(gds_file)
+
+        filecmp.clear_cache()
+        assert filecmp.cmp(ref_file, gds_file, shallow=False)
+
+
+@pytest.mark.sequential
+class TestAsyncQueue(DistributedTest):
+    world_size = 1
+    requires_cuda_env = False
+    if not get_accelerator().is_available():
+        init_distributed = False
+        set_dist_env = False
+
+    @pytest.mark.parametrize("async_queue", [2, 3])
+    def test_read(self, tmpdir, async_queue):
+
+        ref_files = []
+        for i in range(async_queue):
+            f, _ = _do_ref_write(tmpdir, i)
+            ref_files.append(f)
+
+        single_submit = True
+        overlap_events = True
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+        gds_buffers = [
+            torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue)
+        ]
+        for buf in gds_buffers:
+            h.new_device_locked_tensor(buf)
+
+        _validate_handle_state(h, single_submit, overlap_events)
+
+        for i in range(async_queue):
+            read_status = h.async_pread(gds_buffers[i], ref_files[i])
+            assert read_status == 0
+
+        wait_status = h.wait()
+        assert wait_status == async_queue
+
+        for i in range(async_queue):
+            with open(ref_files[i], 'rb') as f:
+                ref_buffer = list(f.read())
+            assert ref_buffer == gds_buffers[i].tolist()
+
+        for t in gds_buffers:
+            h.free_device_locked_tensor(t)
+
+    @pytest.mark.parametrize("async_queue", [2, 3])
+    def test_write(self, tmpdir, async_queue):
+        ref_files = []
+        ref_buffers = []
+        for i in range(async_queue):
+            f, buf = _do_ref_write(tmpdir, i)
+            ref_files.append(f)
+            ref_buffers.append(buf)
+
+        single_submit = True
+        overlap_events = True
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+        gds_files = []
+        gds_buffers = []
+        for i in range(async_queue):
+            f, buf = _get_test_write_file_and_device_buffer(tmpdir, ref_buffers[i], h, i)
+            gds_files.append(f)
+            gds_buffers.append(buf)
+
+        _validate_handle_state(h, single_submit, overlap_events)
+
+        for i in range(async_queue):
+            read_status = h.async_pwrite(gds_buffers[i], gds_files[i])
+            assert read_status == 0
+
+        wait_status = h.wait()
+        assert wait_status == async_queue
+
+        for t in gds_buffers:
+            h.free_device_locked_tensor(t)
+
+        for i in range(async_queue):
+            assert os.path.isfile(gds_files[i])
+
+            filecmp.clear_cache()
+            assert filecmp.cmp(ref_files[i], gds_files[i], shallow=False)

From 90baebac62e382c7ce855d2ab80471445559b5c9 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 6 Aug 2024 15:04:34 -0700
Subject: [PATCH 14/31] Formatting fix

---
 .../predicated_tile_access_iterator_residual_last.h       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
index dcbdc11c27ad..7f6a2430845a 100644
--- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
@@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -695,7 +695,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -1211,7 +1211,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {};
+            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
     };
 
 private:
@@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {};
+            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
     };
 
 private:

From bc4c5998ed860842ebe33991c62aec84c2f406f9 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 6 Aug 2024 15:21:53 -0700
Subject: [PATCH 15/31] Disable build GDS in pre compile ops

---
 .github/workflows/nv-pre-compile-ops.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
index a506bb27fda4..72ba8abbd95d 100644
--- a/.github/workflows/nv-pre-compile-ops.yml
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -36,7 +36,7 @@ jobs:
             #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
         - name: Compile DeepSpeed Ops
           run: |
-            DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+            DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
         - name: DS Report
           run: |
              ds_report

From 759d9f8a2aef06ec64fd1a2d7a64d2b8cc1aab32 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Wed, 7 Aug 2024 22:45:48 +0000
Subject: [PATCH 16/31] updating microbenchmark script

---
 csrc/aio/py_test/ds_aio_handle.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 969afe39cee2..a9c5a9d207d7 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -11,6 +11,7 @@
 import time
 from multiprocessing import Pool, Barrier
 from deepspeed.ops.aio import AsyncIOBuilder
+from deepspeed.ops.op_builder import GDSBuilder
 from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file
 from deepspeed.accelerator import get_accelerator
 
@@ -41,10 +42,11 @@ def pre_handle(args, tid, read_op):
              force=True)
 
     io_parallel = args.io_parallel if args.io_parallel else 1
-    handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
-                                                not args.sequential_requests, gds, io_parallel)
     if gds:
+        handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel)
         handle.new_device_locked_tensor(buffer)
+    else:
+        handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel)
     task_log(tid, f'created deepspeed aio handle')
 
     ctxt = {}

From d56ab675c216dc7f7c66621bf9047c87c8b1af29 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Fri, 9 Aug 2024 16:26:22 -0400
Subject: [PATCH 17/31] Fix formatting

---
 csrc/aio/common/deepspeed_aio_common.cpp               |  5 +++--
 csrc/aio/py_lib/deepspeed_py_aio.cpp                   | 10 ++++++----
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp            | 10 ++++++----
 csrc/aio/py_lib/deepspeed_py_copy.cpp                  |  2 +-
 csrc/aio/py_test/ds_aio_handle.py                      |  6 ++++--
 .../evoformer_attn/gemm_kernel_utils.h                 |  9 +++++----
 csrc/includes/simd.h                                   |  2 +-
 csrc/xpu/includes/simd.h                               |  2 +-
 csrc/xpu/includes/type_shim.h                          | 10 +++++-----
 9 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index a65cc500cc82..0f2895dfa328 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -301,8 +301,9 @@ int regular_read(const char* filename, std::vector<char>& buffer)
     } while (r > 0);
 
     if (read_bytes != num_bytes) {
-        std::cerr << "read error " << " read_bytes (read) = " << read_bytes
-                  << " num_bytes (fstat) = " << num_bytes << std::endl;
+        std::cerr << "read error "
+                  << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
+                  << std::endl;
     }
     assert(read_bytes == num_bytes);
     close(fd);
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index eac268d33433..30b6682ada72 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -69,8 +69,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
 
@@ -114,7 +115,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index 284c84c721f5..a6a68ee1a1d0 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -91,8 +91,9 @@ int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, co
     if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
 
@@ -125,8 +126,9 @@ int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
 
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index f5480e9d9d83..561c46f7c287 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices.
 #include "deepspeed_py_copy.h"
 #include <omp.h>
 
-#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
+#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
 
 #if defined(__AVX512__) or defined(__AVX256__)
 union AVX_Data {
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index a9c5a9d207d7..881cbb7f2b8a 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -43,10 +43,12 @@ def pre_handle(args, tid, read_op):
 
     io_parallel = args.io_parallel if args.io_parallel else 1
     if gds:
-        handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel)
+        handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
+                                                not args.sequential_requests, io_parallel)
         handle.new_device_locked_tensor(buffer)
     else:
-        handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel)
+        handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
+                                                    not args.sequential_requests, io_parallel)
     task_log(tid, f'created deepspeed aio handle')
 
     ctxt = {}
diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
index c102234a4dfb..2a4300c5cac1 100644
--- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
+++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
@@ -125,10 +125,11 @@ struct CheckArch {
         std::cerr << #PTR " is not correctly aligned\n"; \
         return false;                                    \
     }
-#define EVOFORMER_CHECK(COND, ERR)                                                     \
-    if (!(COND)) {                                                                     \
-        std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \
-        return false;                                                                  \
+#define EVOFORMER_CHECK(COND, ERR)                          \
+    if (!(COND)) {                                          \
+        std::cerr << "[Evoformer Attention]"                \
+                  << "'" #COND "' failed: " << ERR << "\n"; \
+        return false;                                       \
     }
 #endif
 
diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h
index a205026ec7c1..f5bfb45dd2e2 100644
--- a/csrc/includes/simd.h
+++ b/csrc/includes/simd.h
@@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val)
     std::memcpy(dst, &val, sizeof(T));
 }
 
-#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
+#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h
index 097e2d8585cc..f77568be7835 100644
--- a/csrc/xpu/includes/simd.h
+++ b/csrc/xpu/includes/simd.h
@@ -13,7 +13,7 @@
 #define TILE (128 * 1024 * 1024)
 #if defined(__AVX512__) or defined(__AVX256__)
 
-#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
+#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h
index 1897afd1fea2..fa41757c895b 100644
--- a/csrc/xpu/includes/type_shim.h
+++ b/csrc/xpu/includes/type_shim.h
@@ -82,11 +82,11 @@
     }
 
 template <typename T>
-__inline__ __attribute__((always_inline)) T
-reduce_block_into_lanes(T* x,
-                        T val,
-                        int lanes = 1,
-                        bool share_result = false)  // lanes is intended to be <= 32.
+__inline__ __attribute__((always_inline)) T reduce_block_into_lanes(
+    T* x,
+    T val,
+    int lanes = 1,
+    bool share_result = false)  // lanes is intended to be <= 32.
 {
     auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>();
     int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2);

From 1e2082581cbc160489b756b00af5418e39b2a57c Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Fri, 9 Aug 2024 19:14:25 -0400
Subject: [PATCH 18/31] Formatting and Typo fixes

---
 csrc/aio/py_lib/deepspeed_aio_op_desc.cpp   | 2 +-
 csrc/aio/py_lib/deepspeed_aio_op_desc.h     | 2 +-
 csrc/aio/py_lib/deepspeed_cpu_op.cpp        | 2 +-
 csrc/aio/py_lib/deepspeed_cpu_op.h          | 2 +-
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 4 ++--
 csrc/aio/py_lib/deepspeed_py_aio_handle.h   | 2 +-
 csrc/aio/py_test/ds_aio_basic.py            | 4 ++--
 csrc/aio/py_test/ds_aio_handle.py           | 4 ++--
 csrc/gds/py_lib/deepspeed_gds_op.cpp        | 3 +--
 csrc/gds/py_lib/deepspeed_gds_op.h          | 2 +-
 10 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
index 5c9bb033c0c2..dc820be528d0 100644
--- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
@@ -27,7 +27,7 @@ io_op_desc_t::io_op_desc_t(const bool read_op,
 
 char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
 
-void io_op_desc_t::fini() {}
+void io_op_desc_t::finish() {}
 
 void io_op_desc_t::validate() {}
 
diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
index c5bffae10265..7305f6920c91 100644
--- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
@@ -36,6 +36,6 @@ struct io_op_desc_t {
 
     virtual void validate();
 
-    virtual void fini();
+    virtual void finish();
 };
 #endif  // _IO_OP_DESC_T_
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
index 767ad5d905e0..b3b1932bd70b 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
@@ -35,7 +35,7 @@ cpu_op_desc_t::cpu_op_desc_t(const bool read_op,
 
 char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
 
-void cpu_op_desc_t::fini()
+void cpu_op_desc_t::finish()
 {
     if (_read_op) {
         if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h
index 07a4369674fc..da96dd2b1d50 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.h
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.h
@@ -27,5 +27,5 @@ struct cpu_op_desc_t : io_op_desc_t {
 
     void validate();
 
-    void fini();
+    void finish();
 };
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index a6a68ee1a1d0..8f2c82f86968 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -179,7 +179,7 @@ int deepspeed_aio_handle_t::wait()
 
         if (completed_op->_validate) { completed_op->validate(); }
 
-        completed_op->fini();
+        completed_op->finish();
 
         close(completed_op->_fd);
 
@@ -195,7 +195,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
 {
     const auto op_string = read_op ? "Read" : "Write";
     if (num_bytes % get_thread_count()) {
-        std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
+        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
                   << " not divisible by thread count = " << get_thread_count() << std::endl;
         return false;
     }
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index 180d9aba2f9c..c89998974743 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -16,7 +16,7 @@ struct deepspeed_aio_handle_t {
     std::unique_ptr<struct aio_context> _aio_ctxt;
     const bool _single_submit;
     const bool _overlap_events;
-    int _num_threads;
+    const int _num_threads;
     deepspeed_aio_config_t _aio_config;
 
     std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py
index e90886e17871..9b3c7cbfc49f 100755
--- a/csrc/aio/py_test/ds_aio_basic.py
+++ b/csrc/aio/py_test/ds_aio_basic.py
@@ -119,7 +119,7 @@ def _aio_handle_tasklet(pool_params):
     return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
 
 
-def _init_takslet(b):
+def _init_tasklet(b):
     global aio_barrier
     aio_barrier = b
 
@@ -128,7 +128,7 @@ def aio_basic_multiprocessing(args, read_op):
     num_processes = len(args.mapping_dict)
     b = Barrier(num_processes)
     pool_params = [(args, p, read_op) for p in range(num_processes)]
-    with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p:
+    with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
         pool_results = p.map(_aio_handle_tasklet, pool_params)
 
     report_results(args, read_op, pool_results)
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 881cbb7f2b8a..9480b20398d6 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -207,7 +207,7 @@ def _aio_handle_tasklet(pool_params):
     return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
 
 
-def _init_takslet(b):
+def _init_tasklet(b):
     global aio_barrier
     aio_barrier = b
 
@@ -216,7 +216,7 @@ def aio_handle_multiprocessing(args, read_op):
     num_processes = len(args.mapping_dict)
     b = Barrier(num_processes)
     pool_params = [(args, p, read_op) for p in range(num_processes)]
-    with Pool(processes=num_processes, initializer=_init_takslet, initargs=(b, )) as p:
+    with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
         pool_results = p.map(_aio_handle_tasklet, pool_params)
 
     report_results(args, read_op, pool_results)
diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp
index b8af24f09070..c370a448e5a2 100644
--- a/csrc/gds/py_lib/deepspeed_gds_op.cpp
+++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp
@@ -12,7 +12,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 using namespace std;
 
 // For when there is more than 1 device
-// static std::set<char*> base_buffer_registry;
 static std::map<const int64_t, std::set<void*>> base_ptr_registry;
 
 static void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle)
@@ -110,7 +109,7 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op,
 
 char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
 
-void gds_op_desc_t::fini() { cuFileHandleDeregister(_cf_handle); }
+void gds_op_desc_t::finish() { cuFileHandleDeregister(_cf_handle); }
 
 void gds_op_desc_t::validate()
 {
diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h
index 3ad8b9ecf58d..b7fab64d4054 100644
--- a/csrc/gds/py_lib/deepspeed_gds_op.h
+++ b/csrc/gds/py_lib/deepspeed_gds_op.h
@@ -34,7 +34,7 @@ struct gds_op_desc_t : io_op_desc_t {
 
     void validate();
 
-    void fini();
+    void finish();
 
     void _report_error(const ssize_t return_code, const int error_num, const off_t offset);
 

From f6c6f66ea0d9e939552e96cd1169846eefc20795 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Sun, 11 Aug 2024 20:15:39 +0000
Subject: [PATCH 19/31] Fix GDS issues

---
 csrc/aio/py_lib/deepspeed_cpu_op.cpp          |  7 +++
 csrc/aio/py_lib/deepspeed_py_copy.cpp         |  2 +-
 ...cated_tile_access_iterator_residual_last.h |  8 +--
 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp   | 28 ++++++++--
 csrc/gds/py_lib/deepspeed_py_gds_handle.h     |  9 +++-
 csrc/gds/py_lib/py_ds_gds.cpp                 |  7 ++-
 csrc/gds/py_test/validate_gds.py              |  7 ++-
 csrc/includes/simd.h                          |  2 +-
 csrc/xpu/includes/simd.h                      |  2 +-
 csrc/xpu/includes/type_shim.h                 | 10 ++--
 op_builder/async_io.py                        |  2 +-
 op_builder/gds.py                             |  2 +-
 tests/unit/ops/aio/test_gds.py                | 52 ++++++++++++++-----
 13 files changed, 101 insertions(+), 37 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
index b3b1932bd70b..41790b99bb88 100644
--- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
@@ -39,6 +39,13 @@ void cpu_op_desc_t::finish()
 {
     if (_read_op) {
         if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+        if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
+#if defined(__ENABLE_CANN__)
+        if (torch_npu::utils::is_npu(_buffer)) {
+            auto device = at::Device("npu:0");
+            _buffer.copy_(_cpu_buffer.to(device));
+        }
+#endif
     }
 }
 
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index 561c46f7c287..f5480e9d9d83 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices.
 #include "deepspeed_py_copy.h"
 #include <omp.h>
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__) or defined(__AVX256__)
 union AVX_Data {
diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
index 7f6a2430845a..dcbdc11c27ad 100644
--- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
@@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -695,7 +695,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -1211,7 +1211,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {};
     };
 
 private:
@@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {};
     };
 
 private:
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index 859ca19535a4..94b89afb6941 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -46,6 +46,11 @@ void deepspeed_gds_handle_t::_init_cuFile(const int block_size,
             std::cerr << "Can't open local cufile" << std::endl;
             exit(EXIT_FAILURE);
         }
+        // TODO: Address the following issues with this code
+        // (1) Fix C++14 warning
+        // (2) Create file in a different location than PWD
+        // (3) Handle multi-GPU/multi-rank scenarios: should cufile be shared, is per-rank cufile
+        // safe?
         putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
         cuFileDriverOpen();
         cudaCheckError();
@@ -65,16 +70,31 @@ void deepspeed_gds_handle_t::_close_cuFile()
     if (deepspeed_gds_handle_t::s_cuFile_init == 0) { cuFileDriverClose(); }
 }
 
-int deepspeed_gds_handle_t::new_device_locked_tensor(const torch::Tensor& buffer)
+torch::Tensor deepspeed_gds_handle_t::new_pinned_device_tensor(const size_t num_elem,
+                                                               const torch::Tensor& example_tensor)
+{
+    auto options = torch::TensorOptions().dtype(example_tensor.scalar_type()).device(torch::kCUDA);
+    auto dev_tensor = torch::empty(num_elem, options);
+    pin_device_tensor(dev_tensor);
+    return dev_tensor;
+}
+
+bool deepspeed_gds_handle_t::free_pinned_device_tensor(torch::Tensor& buffer)
+{
+    unpin_device_tensor(buffer);
+    return true;
+}
+
+bool deepspeed_gds_handle_t::pin_device_tensor(const torch::Tensor& buffer)
 {
     gds_op_desc_t::add_buffer_to_registry(buffer);
-    return 0;
+    return true;
 }
 
-int deepspeed_gds_handle_t::free_device_locked_tensor(const torch::Tensor& buffer)
+bool deepspeed_gds_handle_t::unpin_device_tensor(const torch::Tensor& buffer)
 {
     gds_op_desc_t::remove_buffer_from_registry(buffer);
-    return 0;
+    return true;
 }
 
 std::shared_ptr<struct io_op_desc_t> deepspeed_gds_handle_t::_create_io_op_desc(
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
index 0e42b07a49dc..e04f3b54da37 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
@@ -20,9 +20,14 @@ struct deepspeed_gds_handle_t : deepspeed_aio_handle_t {
 
     ~deepspeed_gds_handle_t();
 
-    int new_device_locked_tensor(const torch::Tensor& buffer);
+    torch::Tensor new_pinned_device_tensor(const size_t num_elem,
+                                           const torch::Tensor& example_tensor);
 
-    int free_device_locked_tensor(const torch::Tensor& buffer);
+    bool free_pinned_device_tensor(torch::Tensor&);
+
+    bool pin_device_tensor(const torch::Tensor& buffer);
+
+    bool unpin_device_tensor(const torch::Tensor& buffer);
 
     void _init_cuFile(const int block_size, const int queue_length, const int num_threads);
 
diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp
index 7d1c34e3bcad..10a7da1535ed 100644
--- a/csrc/gds/py_lib/py_ds_gds.cpp
+++ b/csrc/gds/py_lib/py_ds_gds.cpp
@@ -41,8 +41,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 
         .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor)
         .def("free_cpu_locked_tensor", &deepspeed_gds_handle_t::free_cpu_locked_tensor)
-        .def("new_device_locked_tensor", &deepspeed_gds_handle_t::new_device_locked_tensor)
-        .def("free_device_locked_tensor", &deepspeed_gds_handle_t::free_device_locked_tensor)
+
+        .def("new_pinned_device_tensor", &deepspeed_gds_handle_t::new_pinned_device_tensor)
+        .def("free_pinned_device_tensor", &deepspeed_gds_handle_t::free_pinned_device_tensor)
+        .def("pin_device_tensor", &deepspeed_gds_handle_t::pin_device_tensor)
+        .def("unpin_device_tensor", &deepspeed_gds_handle_t::unpin_device_tensor)
 
         .def("wait", &deepspeed_gds_handle_t::wait);
 }
diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py
index b34b1194f582..ea306f287ae6 100644
--- a/csrc/gds/py_test/validate_gds.py
+++ b/csrc/gds/py_test/validate_gds.py
@@ -6,5 +6,8 @@
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 from deepspeed.ops.op_builder import GDSBuilder
-assert GDSBuilder().is_compatible(True)
-assert GDSBuilder().load(True)
+import pdb
+
+pdb.set_trace()
+assert GDSBuilder().is_compatible(False)
+# assert GDSBuilder().load(True)
diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h
index f5bfb45dd2e2..a205026ec7c1 100644
--- a/csrc/includes/simd.h
+++ b/csrc/includes/simd.h
@@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val)
     std::memcpy(dst, &val, sizeof(T));
 }
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h
index f77568be7835..097e2d8585cc 100644
--- a/csrc/xpu/includes/simd.h
+++ b/csrc/xpu/includes/simd.h
@@ -13,7 +13,7 @@
 #define TILE (128 * 1024 * 1024)
 #if defined(__AVX512__) or defined(__AVX256__)
 
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h
index fa41757c895b..1897afd1fea2 100644
--- a/csrc/xpu/includes/type_shim.h
+++ b/csrc/xpu/includes/type_shim.h
@@ -82,11 +82,11 @@
     }
 
 template <typename T>
-__inline__ __attribute__((always_inline)) T reduce_block_into_lanes(
-    T* x,
-    T val,
-    int lanes = 1,
-    bool share_result = false)  // lanes is intended to be <= 32.
+__inline__ __attribute__((always_inline)) T
+reduce_block_into_lanes(T* x,
+                        T val,
+                        int lanes = 1,
+                        bool share_result = false)  // lanes is intended to be <= 32.
 {
     auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>();
     int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2);
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index a9039def3a40..63c03d0a08ae 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -84,7 +84,7 @@ def check_for_libaio_pkg(self):
                 break
         return found
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         # Check for the existence of libaio by using distutils
         # to compile and link a test program that calls io_submit,
         # which is a function provided by libaio that is used in the async_io op.
diff --git a/op_builder/gds.py b/op_builder/gds.py
index 3b06ca16f40d..e024674e01d8 100644
--- a/op_builder/gds.py
+++ b/op_builder/gds.py
@@ -35,7 +35,7 @@ def include_paths(self):
     def extra_ldflags(self):
         return super().extra_ldflags() + ['-lcufile']
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         import torch.utils.cpp_extension
         CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME
         CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py
index 7afa5970d69f..53655994b560 100644
--- a/tests/unit/ops/aio/test_gds.py
+++ b/tests/unit/ops/aio/test_gds.py
@@ -47,7 +47,7 @@ def _get_test_write_file(tmpdir, index):
 def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index=0):
     test_file = _get_test_write_file(tmpdir, index)
     test_buffer = get_accelerator().ByteTensor(list(ref_buffer))
-    gds_handle.new_device_locked_tensor(test_buffer)
+    gds_handle.pin_device_tensor(test_buffer)
     return test_file, test_buffer
 
 
@@ -64,7 +64,6 @@ def _validate_handle_state(handle, single_submit, overlap_events):
 class TestRead(DistributedTest):
     world_size = 1
     reuse_dist_env = True
-    requires_cuda_env = False
     if not get_accelerator().is_available():
         init_distributed = False
         set_dist_env = False
@@ -74,7 +73,7 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events):
         h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
-        h.new_device_locked_tensor(gds_buffer)
+        h.pin_device_tensor(gds_buffer)
 
         _validate_handle_state(h, single_submit, overlap_events)
 
@@ -86,14 +85,14 @@ def test_parallel_read(self, tmpdir, single_submit, overlap_events):
             ref_buffer = list(f.read())
         assert ref_buffer == gds_buffer.tolist()
 
-        h.free_device_locked_tensor(gds_buffer)
+        h.unpin_device_tensor(gds_buffer)
 
     def test_async_read(self, tmpdir, single_submit, overlap_events):
 
         h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
 
         gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
-        h.new_device_locked_tensor(gds_buffer)
+        h.pin_device_tensor(gds_buffer)
 
         _validate_handle_state(h, single_submit, overlap_events)
 
@@ -108,7 +107,7 @@ def test_async_read(self, tmpdir, single_submit, overlap_events):
             ref_buffer = list(f.read())
         assert ref_buffer == gds_buffer.tolist()
 
-        h.free_device_locked_tensor(gds_buffer)
+        h.unpin_device_tensor(gds_buffer)
 
 
 @pytest.mark.parametrize("single_submit", [True, False])
@@ -116,7 +115,6 @@ def test_async_read(self, tmpdir, single_submit, overlap_events):
 class TestWrite(DistributedTest):
     world_size = 1
     reuse_dist_env = True
-    requires_cuda_env = False
     if not get_accelerator().is_available():
         init_distributed = False
         set_dist_env = False
@@ -133,7 +131,7 @@ def test_parallel_write(self, tmpdir, single_submit, overlap_events):
         write_status = h.sync_pwrite(gds_buffer, gds_file)
         assert write_status == 1
 
-        h.free_device_locked_tensor(gds_buffer)
+        h.unpin_device_tensor(gds_buffer)
 
         assert os.path.isfile(gds_file)
 
@@ -154,7 +152,7 @@ def test_async_write(self, tmpdir, single_submit, overlap_events):
         wait_status = h.wait()
         assert wait_status == 1
 
-        h.free_device_locked_tensor(gds_buffer)
+        h.unpin_device_tensor(gds_buffer)
 
         assert os.path.isfile(gds_file)
 
@@ -165,7 +163,6 @@ def test_async_write(self, tmpdir, single_submit, overlap_events):
 @pytest.mark.sequential
 class TestAsyncQueue(DistributedTest):
     world_size = 1
-    requires_cuda_env = False
     if not get_accelerator().is_available():
         init_distributed = False
         set_dist_env = False
@@ -186,7 +183,7 @@ def test_read(self, tmpdir, async_queue):
             torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue)
         ]
         for buf in gds_buffers:
-            h.new_device_locked_tensor(buf)
+            h.pin_device_tensor(buf)
 
         _validate_handle_state(h, single_submit, overlap_events)
 
@@ -203,7 +200,7 @@ def test_read(self, tmpdir, async_queue):
             assert ref_buffer == gds_buffers[i].tolist()
 
         for t in gds_buffers:
-            h.free_device_locked_tensor(t)
+            h.unpin_device_tensor(t)
 
     @pytest.mark.parametrize("async_queue", [2, 3])
     def test_write(self, tmpdir, async_queue):
@@ -235,10 +232,39 @@ def test_write(self, tmpdir, async_queue):
         assert wait_status == async_queue
 
         for t in gds_buffers:
-            h.free_device_locked_tensor(t)
+            h.unpin_device_tensor(t)
 
         for i in range(async_queue):
             assert os.path.isfile(gds_files[i])
 
             filecmp.clear_cache()
             assert filecmp.cmp(ref_files[i], gds_files[i], shallow=False)
+
+
+@pytest.mark.parametrize("use_new_api", [True, False])
+class TestLockDeviceTensor(DistributedTest):
+    world_size = 2
+    reuse_dist_env = True
+    if not get_accelerator().is_available():
+        init_distributed = False
+        set_dist_env = False
+
+    def test_pin_device_tensor(self, use_new_api):
+
+        h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL)
+
+        unpinned_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
+        if use_new_api:
+            pinned_buffer = h.new_pinned_device_tensor(unpinned_buffer.numel(), unpinned_buffer)
+        else:
+            pinned_buffer = torch.empty_like(unpinned_buffer)
+            h.pin_device_tensor(pinned_buffer)
+
+        assert unpinned_buffer.device == pinned_buffer.device
+        assert unpinned_buffer.dtype == pinned_buffer.dtype
+        assert unpinned_buffer.numel() == pinned_buffer.numel()
+
+        if use_new_api:
+            h.free_pinned_device_tensor(pinned_buffer)
+        else:
+            h.unpin_device_tensor(pinned_buffer)

From 8baf546f096fd169c9c11d7057f9b17ab3a67b2c Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Sun, 11 Aug 2024 21:08:39 -0400
Subject: [PATCH 20/31] Formatting fixes

---
 csrc/aio/common/deepspeed_aio_common.cpp               |  5 ++---
 csrc/aio/py_lib/deepspeed_py_aio.cpp                   | 10 ++++------
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp            | 10 ++++------
 .../evoformer_attn/gemm_kernel_utils.h                 |  9 ++++-----
 .../predicated_tile_access_iterator_residual_last.h    |  8 ++++----
 5 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index 0f2895dfa328..a65cc500cc82 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -301,9 +301,8 @@ int regular_read(const char* filename, std::vector<char>& buffer)
     } while (r > 0);
 
     if (read_bytes != num_bytes) {
-        std::cerr << "read error "
-                  << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
-                  << std::endl;
+        std::cerr << "read error " << " read_bytes (read) = " << read_bytes
+                  << " num_bytes (fstat) = " << num_bytes << std::endl;
     }
     assert(read_bytes == num_bytes);
     close(fd);
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index 30b6682ada72..eac268d33433 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -69,9 +69,8 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
 
@@ -115,8 +114,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index 8f2c82f86968..1ed3645276bd 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -91,9 +91,8 @@ int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, co
     if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
 
@@ -126,9 +125,8 @@ int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): "
-              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
-              << std::endl;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
     return 0;
 }
 
diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
index 2a4300c5cac1..c102234a4dfb 100644
--- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
+++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
@@ -125,11 +125,10 @@ struct CheckArch {
         std::cerr << #PTR " is not correctly aligned\n"; \
         return false;                                    \
     }
-#define EVOFORMER_CHECK(COND, ERR)                          \
-    if (!(COND)) {                                          \
-        std::cerr << "[Evoformer Attention]"                \
-                  << "'" #COND "' failed: " << ERR << "\n"; \
-        return false;                                       \
+#define EVOFORMER_CHECK(COND, ERR)                                                     \
+    if (!(COND)) {                                                                     \
+        std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \
+        return false;                                                                  \
     }
 #endif
 
diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
index dcbdc11c27ad..7f6a2430845a 100644
--- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_access_iterator_residual_last.h
@@ -488,7 +488,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -695,7 +695,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
-        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))) {};
+        Params(Layout const& layout) : params_(layout::PitchLinear(layout.stride(0))){};
 
         /// Construct the Params object given a pitch-linear tensor's layout
         CUTLASS_HOST_DEVICE
@@ -1211,7 +1211,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {};
+            : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
     };
 
 private:
@@ -1413,7 +1413,7 @@ class PredicatedTileAccessIteratorResidualLast<Shape_,
         /// Construct the Params object given an AffineRankN<2> tensor's layout
         CUTLASS_HOST_DEVICE
         Params(Layout const& layout)
-            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {};
+            : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
     };
 
 private:

From 101a43d3b323427b107dc5c6d0c442ef6ad34869 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Mon, 12 Aug 2024 10:43:45 -0400
Subject: [PATCH 21/31] Avoid xpu regression (#4130)

---
 op_builder/async_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index 63c03d0a08ae..63fc2c94529c 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -90,7 +90,7 @@ def is_compatible(self, verbose=False):
         # which is a function provided by libaio that is used in the async_io op.
         # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS
         # respectively to specify the directories for libaio.h and libaio.so.
-        aio_compatible = self.has_function('io_submit', ('aio', ))
+        aio_compatible = self.has_function('io_pgetevents', ('aio', ))
         if verbose and not aio_compatible:
             self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.")
 

From 76f8e1ce63f78b0f23878b065fe64f5c61193c25 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Mon, 12 Aug 2024 20:30:22 +0000
Subject: [PATCH 22/31] microbenchmark updates

---
 csrc/aio/py_test/ds_aio_handle.py  | 2 +-
 csrc/aio/py_test/run_read_sweep.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index a9c5a9d207d7..96c72f08027b 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -43,7 +43,7 @@ def pre_handle(args, tid, read_op):
 
     io_parallel = args.io_parallel if args.io_parallel else 1
     if gds:
-        handle = GDSBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel)
+        handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel)
         handle.new_device_locked_tensor(buffer)
     else:
         handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel)
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index 7c2cb46f83fc..59d82996a0e2 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -95,7 +95,7 @@ for xtype in cpu gpu gds; do
                 for t in 1 2 4 8; do
                     for d in 8 16 32 64 128; do
                         for bs in 128K 256K 512K 1M 2M 4M 8M 16M; do
-                            SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme03:0"
+                            SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme01:0"
                             OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
                             LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                             cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"

From 6c464031edbd2c01e7643f331342a3b6c0bd9df5 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 15 Aug 2024 11:15:11 -0400
Subject: [PATCH 23/31] Avoid passing class member num_threads

---
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 9 +++------
 csrc/aio/py_lib/deepspeed_py_aio_handle.h   | 1 -
 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 5 ++---
 csrc/gds/py_lib/deepspeed_py_gds_handle.h   | 1 -
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index 1ed3645276bd..9c560621dc63 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -207,11 +207,10 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_create_io_op_desc(
     const int fd,
     const char* filename,
     const long long int file_num_bytes,
-    const int num_threads,
     const bool validate)
 {
     return std::make_shared<cpu_op_desc_t>(
-        read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+        read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
 }
 
 int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
@@ -238,8 +237,7 @@ int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
     const auto fd = open_file(filename, true);
     if (fd == -1) { return -1; }
 
-    auto scheduled_op =
-        _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, _num_threads, validate);
+    auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate);
 
     _schedule_aio_work(scheduled_op);
 
@@ -261,8 +259,7 @@ int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
     const auto fd = open_file(filename, false);
     if (fd == -1) { return -1; }
 
-    auto scheduled_op =
-        _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, _num_threads, validate);
+    auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate);
 
     _schedule_aio_work(scheduled_op);
 
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index c89998974743..8f1d8f5d0514 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -81,6 +81,5 @@ struct deepspeed_aio_handle_t {
         const int fd,
         const char* filename,
         const long long int file_num_bytes,
-        const int num_threads,
         const bool validate);
 };
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index 94b89afb6941..c58b6da405ff 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -103,13 +103,12 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_gds_handle_t::_create_io_op_desc(
     const int fd,
     const char* filename,
     const long long int file_num_bytes,
-    const int num_threads,
     const bool validate)
 {
     if (buffer.is_cuda()) {
         return std::make_shared<gds_op_desc_t>(
-            read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+            read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
     }
     return deepspeed_aio_handle_t::_create_io_op_desc(
-        read_op, buffer, fd, filename, file_num_bytes, num_threads, validate);
+        read_op, buffer, fd, filename, file_num_bytes, validate);
 }
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
index e04f3b54da37..2d53b0efc190 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
@@ -38,7 +38,6 @@ struct deepspeed_gds_handle_t : deepspeed_aio_handle_t {
                                                             const int fd,
                                                             const char* filename,
                                                             const long long int file_num_bytes,
-                                                            const int num_threads,
                                                             const bool validate);
 
     static int s_cuFile_init;

From d0a222cd51e1657594f23e163d490aab4f6cc0b8 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 15 Aug 2024 11:53:57 -0400
Subject: [PATCH 24/31] Create abstract base class

---
 csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 280 +-----------------
 csrc/aio/py_lib/deepspeed_py_aio_handle.h   |  67 +----
 csrc/aio/py_lib/deepspeed_py_io_handle.cpp  | 300 ++++++++++++++++++++
 csrc/aio/py_lib/deepspeed_py_io_handle.h    |  85 ++++++
 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp |   4 +-
 csrc/gds/py_lib/deepspeed_py_gds_handle.h   |   4 +-
 op_builder/async_io.py                      |  13 +-
 7 files changed, 400 insertions(+), 353 deletions(-)
 create mode 100644 csrc/aio/py_lib/deepspeed_py_io_handle.cpp
 create mode 100644 csrc/aio/py_lib/deepspeed_py_io_handle.h

diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index 9c560621dc63..c7ca5e82afde 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -12,289 +12,13 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 using namespace std;
 
-static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
-
 deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
                                                const int queue_depth,
                                                const bool single_submit,
                                                const bool overlap_events,
                                                const int num_threads)
-    : _aio_ctxt(new aio_context(block_size, queue_depth)),
-      _single_submit(single_submit),
-      _overlap_events(overlap_events),
-      _num_threads(num_threads),
-      _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
-      _num_pending_ops(0),
-      _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
-{
-    for (auto i = 0; i < num_threads; ++i) {
-        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
-    }
-
-    for (auto& ctxt : _thread_contexts) {
-        _threads.push_back(std::thread(_start_aio_thread, ctxt));
-    }
-}
-
-deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
-{
-    _stop_threads();
-    for (auto& thr : _threads) { thr.join(); }
-}
-
-const int deepspeed_aio_handle_t::get_block_size() const
-{
-    return _aio_ctxt ? _aio_ctxt->_block_size : -1;
-}
-
-const int deepspeed_aio_handle_t::get_queue_depth() const
-{
-    return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
-}
-
-const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }
-
-const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
-
-const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
-
-int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
-{
-    const auto start_time = std::chrono::high_resolution_clock::now();
-
-    assert(_aio_ctxt);
-
-    long long num_file_bytes;
-    if (-1 == get_file_size(filename, num_file_bytes)) {
-        const auto error_code = errno;
-        report_file_error(filename, " fstat for read", error_code);
-        return -1;
-    }
-    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
-
-    const auto fd = open_file(filename, true);
-    if (fd == -1) { return -1; }
-
-    auto read_buffer = (char*)buffer.data_ptr();
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
-
-    if (_aio_config._overlap_events) {
-        do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
-    } else {
-        do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
-    }
-
-    close(fd);
-    const std::chrono::duration<double> aio_time =
-        std::chrono::high_resolution_clock::now() - start_time;
-
-    if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
-    const std::chrono::duration<double> fn_time =
-        std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
-    return 0;
-}
-
-int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
-                                  const char* filename,
-                                  const bool validate)
-{
-    assert(_aio_ctxt);
-
-    const auto start_time = std::chrono::high_resolution_clock::now();
-
-    const auto fd = open_file(filename, false);
-    if (fd == -1) { return -1; }
-
-    auto write_buffer = (char*)buffer.data_ptr();
-    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
-    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
-
-    if (_aio_config._overlap_events) {
-        do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
-    } else {
-        do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
-    }
-    const std::chrono::duration<double> aio_time =
-        std::chrono::high_resolution_clock::now() - start_time;
-
-    close(fd);
-
-    if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
-
-    const std::chrono::duration<double> fn_time =
-        std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
-    return 0;
-}
-
-void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
-{
-    for (auto& ctxt : _thread_contexts) {
-        {
-            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
-            ctxt->_work_queue.push(scheduled_op);
-        }
-        ctxt->_work_sync._cond_var.notify_one();
-    }
-    _num_pending_ops++;
-}
-
-std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
-{
-    std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
-    for (auto& ctxt : _thread_contexts) {
-        std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
-        ctxt->_complete_sync._cond_var.wait(lock,
-                                            [ctxt] { return !ctxt->_complete_queue.empty(); });
-        completed_op = ctxt->_complete_queue.front();
-        ctxt->_complete_queue.pop();
-    }
-    return completed_op;
-}
-
-void deepspeed_aio_handle_t::_stop_threads()
-{
-    assert(0 == _num_pending_ops);
-    for (auto& ctxt : _thread_contexts) {
-        {
-            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
-            ctxt->_time_to_exit = true;
-        }
-        ctxt->_work_sync._cond_var.notify_one();
-    }
-}
-
-int deepspeed_aio_handle_t::wait()
-{
-    assert(_num_pending_ops > 0);
-    auto num_completed_ops = 0;
-
-    while (_num_pending_ops > 0) {
-        auto completed_op = _wait_for_aio_work();
-
-        if (completed_op->_validate) { completed_op->validate(); }
-
-        completed_op->finish();
-
-        close(completed_op->_fd);
-
-        --_num_pending_ops;
-        ++num_completed_ops;
-    }
-
-    return num_completed_ops;
-}
-
-bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
-                                                       const long long int num_bytes)
-{
-    const auto op_string = read_op ? "Read" : "Write";
-    if (num_bytes % get_thread_count()) {
-        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
-                  << " not divisible by thread count = " << get_thread_count() << std::endl;
-        return false;
-    }
-
-    return true;
-}
-
-std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_create_io_op_desc(
-    const bool read_op,
-    const torch::Tensor& buffer,
-    const int fd,
-    const char* filename,
-    const long long int file_num_bytes,
-    const bool validate)
+    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
 {
-    return std::make_shared<cpu_op_desc_t>(
-        read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
 }
 
-int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
-                                  const char* filename,
-                                  const bool validate,
-                                  const bool async)
-{
-    long long num_file_bytes;
-    if (-1 == get_file_size(filename, num_file_bytes)) {
-        const auto error_code = errno;
-        report_file_error(filename, " fstat for read", error_code);
-        return -1;
-    }
-    const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
-    if (buffer_bytes != num_file_bytes) {
-        std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
-                  << " != " << num_file_bytes << std::endl;
-    }
-    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
-    assert((num_file_bytes % _num_threads) == 0);
-
-    if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
-
-    const auto fd = open_file(filename, true);
-    if (fd == -1) { return -1; }
-
-    auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate);
-
-    _schedule_aio_work(scheduled_op);
-
-    if (async) { return 0; }
-
-    return wait();
-}
-
-int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
-                                   const char* filename,
-                                   const bool validate,
-                                   const bool async)
-{
-    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
-    assert((num_write_bytes % _num_threads) == 0);
-
-    if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
-
-    const auto fd = open_file(filename, false);
-    if (fd == -1) { return -1; }
-
-    auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate);
-
-    _schedule_aio_work(scheduled_op);
-
-    if (async) { return 0; }
-
-    return wait();
-}
-
-int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
-{
-    return pread(buffer, filename, false, false);
-}
-
-int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
-{
-    return pwrite(buffer, filename, false, false);
-}
-
-int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
-{
-    return pread(buffer, filename, false, true);
-}
-
-int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
-{
-    return pwrite(buffer, filename, false, true);
-}
-
-at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem,
-                                                         const torch::Tensor& example_tensor)
-{
-    return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type());
-}
-
-bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
-{
-    return _pinned_tensor_mgr->free(locked_tensor);
-}
+deepspeed_aio_handle_t::~deepspeed_aio_handle_t() {}
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index 8f1d8f5d0514..eb6b90ea22f0 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -9,21 +9,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 #include <condition_variable>
 #include <memory>
-#include "deepspeed_aio_thread.h"
-#include "deepspeed_pin_tensor.h"
-
-struct deepspeed_aio_handle_t {
-    std::unique_ptr<struct aio_context> _aio_ctxt;
-    const bool _single_submit;
-    const bool _overlap_events;
-    const int _num_threads;
-    deepspeed_aio_config_t _aio_config;
-
-    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
-    std::vector<std::thread> _threads;
-    int _num_pending_ops;
-    std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
+#include "deepspeed_py_io_handle.h"
 
+struct deepspeed_aio_handle_t : deepspeed_io_handle_t {
     deepspeed_aio_handle_t(const int block_size,
                            const int queue_depth,
                            const bool single_submit,
@@ -31,55 +19,4 @@ struct deepspeed_aio_handle_t {
                            const int num_threads);
 
     ~deepspeed_aio_handle_t();
-
-    const int get_block_size() const;
-    const int get_queue_depth() const;
-    const bool get_single_submit() const;
-    const bool get_overlap_events() const;
-    const int get_thread_count() const;
-
-    int read(torch::Tensor& buffer, const char* filename, const bool validate);
-
-    int write(const torch::Tensor& buffer, const char* filename, const bool validate);
-
-    int pread(const torch::Tensor& buffer,
-              const char* filename,
-              const bool validate,
-              const bool async);
-
-    int pwrite(const torch::Tensor& buffer,
-               const char* filename,
-               const bool validate,
-               const bool async);
-
-    int sync_pread(torch::Tensor& buffer, const char* filename);
-
-    int sync_pwrite(const torch::Tensor& buffer, const char* filename);
-
-    int async_pread(torch::Tensor& buffer, const char* filename);
-
-    int async_pwrite(const torch::Tensor& buffer, const char* filename);
-
-    // TODO: Make API's args to be shape and dtype.
-    torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
-
-    bool free_cpu_locked_tensor(torch::Tensor&);
-
-    int wait();
-
-    void _stop_threads();
-
-    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
-
-    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
-
-    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
-
-    virtual std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(
-        const bool read_op,
-        const torch::Tensor& buffer,
-        const int fd,
-        const char* filename,
-        const long long int file_num_bytes,
-        const bool validate);
 };
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
new file mode 100644
index 000000000000..c31ca3040ee2
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
@@ -0,0 +1,300 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include "deepspeed_py_io_handle.h"
+#include <cstdlib>
+
+using namespace std;
+
+static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
+
+deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size,
+                                               const int queue_depth,
+                                               const bool single_submit,
+                                               const bool overlap_events,
+                                               const int num_threads)
+    : _aio_ctxt(new aio_context(block_size, queue_depth)),
+      _single_submit(single_submit),
+      _overlap_events(overlap_events),
+      _num_threads(num_threads),
+      _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
+      _num_pending_ops(0),
+      _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
+{
+    for (auto i = 0; i < num_threads; ++i) {
+        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
+    }
+
+    for (auto& ctxt : _thread_contexts) {
+        _threads.push_back(std::thread(_start_aio_thread, ctxt));
+    }
+}
+
+deepspeed_io_handle_t::~deepspeed_io_handle_t()
+{
+    _stop_threads();
+    for (auto& thr : _threads) { thr.join(); }
+}
+
+const int deepspeed_io_handle_t::get_block_size() const
+{
+    return _aio_ctxt ? _aio_ctxt->_block_size : -1;
+}
+
+const int deepspeed_io_handle_t::get_queue_depth() const
+{
+    return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
+}
+
+const bool deepspeed_io_handle_t::get_single_submit() const { return _single_submit; }
+
+const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_events; }
+
+const int deepspeed_io_handle_t::get_thread_count() const { return _num_threads; }
+
+int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
+{
+    const auto start_time = std::chrono::high_resolution_clock::now();
+
+    assert(_aio_ctxt);
+
+    long long num_file_bytes;
+    if (-1 == get_file_size(filename, num_file_bytes)) {
+        const auto error_code = errno;
+        report_file_error(filename, " fstat for read", error_code);
+        return -1;
+    }
+    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
+
+    const auto fd = open_file(filename, true);
+    if (fd == -1) { return -1; }
+
+    auto read_buffer = (char*)buffer.data_ptr();
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
+
+    if (_aio_config._overlap_events) {
+        do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+    } else {
+        do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+    }
+
+    close(fd);
+    const std::chrono::duration<double> aio_time =
+        std::chrono::high_resolution_clock::now() - start_time;
+
+    if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
+    const std::chrono::duration<double> fn_time =
+        std::chrono::high_resolution_clock::now() - start_time;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
+    return 0;
+}
+
+int deepspeed_io_handle_t::write(const torch::Tensor& buffer,
+                                  const char* filename,
+                                  const bool validate)
+{
+    assert(_aio_ctxt);
+
+    const auto start_time = std::chrono::high_resolution_clock::now();
+
+    const auto fd = open_file(filename, false);
+    if (fd == -1) { return -1; }
+
+    auto write_buffer = (char*)buffer.data_ptr();
+    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
+    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
+
+    if (_aio_config._overlap_events) {
+        do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+    } else {
+        do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+    }
+    const std::chrono::duration<double> aio_time =
+        std::chrono::high_resolution_clock::now() - start_time;
+
+    close(fd);
+
+    if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
+
+    const std::chrono::duration<double> fn_time =
+        std::chrono::high_resolution_clock::now() - start_time;
+    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+              << " call = " << fn_time.count() * 1e6 << std::endl;
+    return 0;
+}
+
+void deepspeed_io_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
+{
+    for (auto& ctxt : _thread_contexts) {
+        {
+            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
+            ctxt->_work_queue.push(scheduled_op);
+        }
+        ctxt->_work_sync._cond_var.notify_one();
+    }
+    _num_pending_ops++;
+}
+
+std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_wait_for_aio_work()
+{
+    std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
+    for (auto& ctxt : _thread_contexts) {
+        std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
+        ctxt->_complete_sync._cond_var.wait(lock,
+                                            [ctxt] { return !ctxt->_complete_queue.empty(); });
+        completed_op = ctxt->_complete_queue.front();
+        ctxt->_complete_queue.pop();
+    }
+    return completed_op;
+}
+
+void deepspeed_io_handle_t::_stop_threads()
+{
+    assert(0 == _num_pending_ops);
+    for (auto& ctxt : _thread_contexts) {
+        {
+            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
+            ctxt->_time_to_exit = true;
+        }
+        ctxt->_work_sync._cond_var.notify_one();
+    }
+}
+
+int deepspeed_io_handle_t::wait()
+{
+    assert(_num_pending_ops > 0);
+    auto num_completed_ops = 0;
+
+    while (_num_pending_ops > 0) {
+        auto completed_op = _wait_for_aio_work();
+
+        if (completed_op->_validate) { completed_op->validate(); }
+
+        completed_op->finish();
+
+        close(completed_op->_fd);
+
+        --_num_pending_ops;
+        ++num_completed_ops;
+    }
+
+    return num_completed_ops;
+}
+
+bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op,
+                                                       const long long int num_bytes)
+{
+    const auto op_string = read_op ? "Read" : "Write";
+    if (num_bytes % get_thread_count()) {
+        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
+                  << " not divisible by thread count = " << get_thread_count() << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc(
+    const bool read_op,
+    const torch::Tensor& buffer,
+    const int fd,
+    const char* filename,
+    const long long int file_num_bytes,
+    const bool validate)
+{
+    return std::make_shared<cpu_op_desc_t>(
+        read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
+}
+
+int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
+                                  const char* filename,
+                                  const bool validate,
+                                  const bool async)
+{
+    long long num_file_bytes;
+    if (-1 == get_file_size(filename, num_file_bytes)) {
+        const auto error_code = errno;
+        report_file_error(filename, " fstat for read", error_code);
+        return -1;
+    }
+    const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
+    if (buffer_bytes != num_file_bytes) {
+        std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
+                  << " != " << num_file_bytes << std::endl;
+    }
+    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
+    assert((num_file_bytes % _num_threads) == 0);
+
+    if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
+
+    const auto fd = open_file(filename, true);
+    if (fd == -1) { return -1; }
+
+    auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate);
+
+    _schedule_aio_work(scheduled_op);
+
+    if (async) { return 0; }
+
+    return wait();
+}
+
+int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
+                                   const char* filename,
+                                   const bool validate,
+                                   const bool async)
+{
+    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
+    assert((num_write_bytes % _num_threads) == 0);
+
+    if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
+
+    const auto fd = open_file(filename, false);
+    if (fd == -1) { return -1; }
+
+    auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate);
+
+    _schedule_aio_work(scheduled_op);
+
+    if (async) { return 0; }
+
+    return wait();
+}
+
+int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
+{
+    return pread(buffer, filename, false, false);
+}
+
+int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
+{
+    return pwrite(buffer, filename, false, false);
+}
+
+int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
+{
+    return pread(buffer, filename, false, true);
+}
+
+int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
+{
+    return pwrite(buffer, filename, false, true);
+}
+
+at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem,
+                                                         const torch::Tensor& example_tensor)
+{
+    return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type());
+}
+
+bool deepspeed_io_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
+{
+    return _pinned_tensor_mgr->free(locked_tensor);
+}
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h
new file mode 100644
index 000000000000..b9894fe36416
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h
@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include <condition_variable>
+#include <memory>
+#include "deepspeed_aio_thread.h"
+#include "deepspeed_pin_tensor.h"
+
+struct deepspeed_io_handle_t {
+    std::unique_ptr<struct aio_context> _aio_ctxt;
+    const bool _single_submit;
+    const bool _overlap_events;
+    const int _num_threads;
+    deepspeed_aio_config_t _aio_config;
+
+    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
+    std::vector<std::thread> _threads;
+    int _num_pending_ops;
+    std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
+
+    deepspeed_io_handle_t(const int block_size,
+                           const int queue_depth,
+                           const bool single_submit,
+                           const bool overlap_events,
+                           const int num_threads);
+
+    virtual ~deepspeed_io_handle_t() = 0;
+
+    const int get_block_size() const;
+    const int get_queue_depth() const;
+    const bool get_single_submit() const;
+    const bool get_overlap_events() const;
+    const int get_thread_count() const;
+
+    int read(torch::Tensor& buffer, const char* filename, const bool validate);
+
+    int write(const torch::Tensor& buffer, const char* filename, const bool validate);
+
+    int pread(const torch::Tensor& buffer,
+              const char* filename,
+              const bool validate,
+              const bool async);
+
+    int pwrite(const torch::Tensor& buffer,
+               const char* filename,
+               const bool validate,
+               const bool async);
+
+    int sync_pread(torch::Tensor& buffer, const char* filename);
+
+    int sync_pwrite(const torch::Tensor& buffer, const char* filename);
+
+    int async_pread(torch::Tensor& buffer, const char* filename);
+
+    int async_pwrite(const torch::Tensor& buffer, const char* filename);
+
+    // TODO: Make API's args to be shape and dtype.
+    torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
+
+    bool free_cpu_locked_tensor(torch::Tensor&);
+
+    int wait();
+
+    void _stop_threads();
+
+    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
+
+    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
+
+    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
+
+    virtual std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(
+        const bool read_op,
+        const torch::Tensor& buffer,
+        const int fd,
+        const char* filename,
+        const long long int file_num_bytes,
+        const bool validate);
+};
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index c58b6da405ff..3a35ad3145a0 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -20,7 +20,7 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
                                                const bool single_submit,
                                                const bool overlap_events,
                                                const int num_threads)
-    : deepspeed_aio_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
 {
     _init_cuFile(block_size, queue_depth, num_threads);
 }
@@ -109,6 +109,6 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_gds_handle_t::_create_io_op_desc(
         return std::make_shared<gds_op_desc_t>(
             read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
     }
-    return deepspeed_aio_handle_t::_create_io_op_desc(
+    return deepspeed_io_handle_t::_create_io_op_desc(
         read_op, buffer, fd, filename, file_num_bytes, validate);
 }
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
index 2d53b0efc190..f324e6b65e80 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
@@ -9,9 +9,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 
 #include <condition_variable>
 #include <memory>
-#include "deepspeed_py_aio_handle.h"
+#include "deepspeed_py_io_handle.h"
 
-struct deepspeed_gds_handle_t : deepspeed_aio_handle_t {
+struct deepspeed_gds_handle_t : deepspeed_io_handle_t {
     deepspeed_gds_handle_t(const int block_size,
                            const int queue_depth,
                            const bool single_submit,
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index 63fc2c94529c..08413517b134 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -22,11 +22,12 @@ def absolute_name(self):
 
     def lib_sources(self):
         src_list = [
-            'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
-            'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp',
-            'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp',
-            'csrc/aio/py_lib/deepspeed_cpu_op.cpp', 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp',
-            'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
+            'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp',
+            'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp',
+            'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp',
+            'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp',
+            'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp',
+            'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
         ]
         return src_list
 
@@ -90,7 +91,7 @@ def is_compatible(self, verbose=False):
         # which is a function provided by libaio that is used in the async_io op.
         # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS
         # respectively to specify the directories for libaio.h and libaio.so.
-        aio_compatible = self.has_function('io_pgetevents', ('aio', ))
+        aio_compatible = self.has_function('io_submit', ('aio', ))
         if verbose and not aio_compatible:
             self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.")
 

From 01f74571874ce4419241d56bc876e7ed6a18ffa0 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Thu, 15 Aug 2024 16:24:08 +0000
Subject: [PATCH 25/31] Remove pdb

---
 csrc/gds/py_test/validate_gds.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py
index ea306f287ae6..b34b1194f582 100644
--- a/csrc/gds/py_test/validate_gds.py
+++ b/csrc/gds/py_test/validate_gds.py
@@ -6,8 +6,5 @@
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 from deepspeed.ops.op_builder import GDSBuilder
-import pdb
-
-pdb.set_trace()
-assert GDSBuilder().is_compatible(False)
-# assert GDSBuilder().load(True)
+assert GDSBuilder().is_compatible(True)
+assert GDSBuilder().load(True)

From ccc7d18323409541774f3f0e0c25eb196baba3e4 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Thu, 15 Aug 2024 15:00:18 -0400
Subject: [PATCH 26/31] Formatting

---
 csrc/aio/py_lib/deepspeed_py_io_handle.cpp | 28 +++++++++++-----------
 csrc/aio/py_lib/deepspeed_py_io_handle.h   |  8 +++----
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
index c31ca3040ee2..bdf2a858d797 100644
--- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
@@ -15,10 +15,10 @@ using namespace std;
 static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
 
 deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size,
-                                               const int queue_depth,
-                                               const bool single_submit,
-                                               const bool overlap_events,
-                                               const int num_threads)
+                                             const int queue_depth,
+                                             const bool single_submit,
+                                             const bool overlap_events,
+                                             const int num_threads)
     : _aio_ctxt(new aio_context(block_size, queue_depth)),
       _single_submit(single_submit),
       _overlap_events(overlap_events),
@@ -97,8 +97,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con
 }
 
 int deepspeed_io_handle_t::write(const torch::Tensor& buffer,
-                                  const char* filename,
-                                  const bool validate)
+                                 const char* filename,
+                                 const bool validate)
 {
     assert(_aio_ctxt);
 
@@ -189,7 +189,7 @@ int deepspeed_io_handle_t::wait()
 }
 
 bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op,
-                                                       const long long int num_bytes)
+                                                      const long long int num_bytes)
 {
     const auto op_string = read_op ? "Read" : "Write";
     if (num_bytes % get_thread_count()) {
@@ -214,9 +214,9 @@ std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc(
 }
 
 int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
-                                  const char* filename,
-                                  const bool validate,
-                                  const bool async)
+                                 const char* filename,
+                                 const bool validate,
+                                 const bool async)
 {
     long long num_file_bytes;
     if (-1 == get_file_size(filename, num_file_bytes)) {
@@ -247,9 +247,9 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
 }
 
 int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
-                                   const char* filename,
-                                   const bool validate,
-                                   const bool async)
+                                  const char* filename,
+                                  const bool validate,
+                                  const bool async)
 {
     const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
     assert((num_write_bytes % _num_threads) == 0);
@@ -289,7 +289,7 @@ int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char*
 }
 
 at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem,
-                                                         const torch::Tensor& example_tensor)
+                                                        const torch::Tensor& example_tensor)
 {
     return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type());
 }
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h
index b9894fe36416..2974ebe87bfc 100644
--- a/csrc/aio/py_lib/deepspeed_py_io_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h
@@ -25,10 +25,10 @@ struct deepspeed_io_handle_t {
     std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
 
     deepspeed_io_handle_t(const int block_size,
-                           const int queue_depth,
-                           const bool single_submit,
-                           const bool overlap_events,
-                           const int num_threads);
+                          const int queue_depth,
+                          const bool single_submit,
+                          const bool overlap_events,
+                          const int num_threads);
 
     virtual ~deepspeed_io_handle_t() = 0;
 

From 5043f43a476ff1ffbc5b443485ed56b698724cfd Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Thu, 15 Aug 2024 19:12:39 +0000
Subject: [PATCH 27/31] updating sweep script with gds

---
 csrc/aio/py_test/aio_bench_perf_sweep.py | 9 ++++++---
 csrc/aio/py_test/ds_aio_handle.py        | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py
index 5d4172066a6b..24339ea5a54e 100644
--- a/csrc/aio/py_test/aio_bench_perf_sweep.py
+++ b/csrc/aio/py_test/aio_bench_perf_sweep.py
@@ -38,10 +38,11 @@ def __init__(self, args):
         self.write = not args.no_write
         self.flush_cache = not args.no_sudo
         self.log_dir = args.log_dir
+        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}'
         if args.gpu:
-            self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size} --gpu'
-        else:
-            self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}'
+            self.other_options += ' --gpu'
+        if args.gds:
+            self.other_options += ' --use_gds'
 
 
 def parse_arguments():
@@ -65,6 +66,8 @@ def parse_arguments():
 
     parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.')
 
+    parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator')
+
     parser.add_argument(
         '--no_sudo',
         action='store_true',
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 721daa96a91a..0b8a58ab1c5c 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -44,7 +44,7 @@ def pre_handle(args, tid, read_op):
     io_parallel = args.io_parallel if args.io_parallel else 1
     if gds:
         handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel)
-        handle.new_device_locked_tensor(buffer)
+        handle.pin_device_tensor(buffer)
     else:
         handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
                                                     not args.sequential_requests, io_parallel)
@@ -79,7 +79,7 @@ def post_handle(pool_params):
     for buf in [BUFFER, BOUNCE_BUFFER]:
         if ctxt[buf] is not None:
             if ctxt['gds']:
-                ctxt['handle'].free_device_locked_tensor(ctxt[buf])
+                ctxt['handle'].unpin_device_tensor(ctxt[buf])
             ctxt[buf].detach()
             ctxt[buf] = None
     return ctxt

From 4833c03ab9d89f5a6d31cef19087398614c49143 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Thu, 15 Aug 2024 15:52:23 -0700
Subject: [PATCH 28/31] Formatting

---
 csrc/aio/py_test/ds_aio_handle.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 0b8a58ab1c5c..f4a179deb9ec 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -43,7 +43,8 @@ def pre_handle(args, tid, read_op):
 
     io_parallel = args.io_parallel if args.io_parallel else 1
     if gds:
-        handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,not args.sequential_requests, io_parallel)
+        handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,
+                                                not args.sequential_requests, io_parallel)
         handle.pin_device_tensor(buffer)
     else:
         handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,

From 3b400423c92c967e4e0a59cada997de47cd2cb0c Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Thu, 15 Aug 2024 23:35:14 +0000
Subject: [PATCH 29/31] cpp std based on torch version

---
 op_builder/builder.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/op_builder/builder.py b/op_builder/builder.py
index ca4b339e2447..3f962d3dfee3 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -746,10 +746,15 @@ def hipify_extension(self):
             )
 
     def cxx_args(self):
+        TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2])
+        if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1:
+            CPP_STD = '-std=c++17'
+        else:
+            CPP_STD = '-std=c++14'
         if sys.platform == "win32":
             return ['-O2']
         else:
-            return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
+            return ['-O3', CPP_STD, '-g', '-Wno-reorder']
 
     def nvcc_args(self):
         if self.build_for_cpu:

From a75010b101defb87fb8ca65a4bd5ce0e63383e82 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <deepspeed@H100-VM2.shlnn55tgwve1eacvp21ie45dg.jx.internal.cloudapp.net>
Date: Fri, 16 Aug 2024 00:57:39 +0000
Subject: [PATCH 30/31] moving torch check to aio only

---
 op_builder/async_io.py | 5 +++++
 op_builder/builder.py  | 7 +------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index 08413517b134..e7f16adbf2a3 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -51,6 +51,11 @@ def include_paths(self):
     def cxx_args(self):
         # -O0 for improved debugging, since performance is bound by I/O
         args = super().cxx_args()
+        import torch
+        TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2])
+        if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1):
+            args.remove('-std=c++17')
+            args.append('-std=c++14')
         args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder']
         return args
 
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 3f962d3dfee3..ca4b339e2447 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -746,15 +746,10 @@ def hipify_extension(self):
             )
 
     def cxx_args(self):
-        TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2])
-        if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1:
-            CPP_STD = '-std=c++17'
-        else:
-            CPP_STD = '-std=c++14'
         if sys.platform == "win32":
             return ['-O2']
         else:
-            return ['-O3', CPP_STD, '-g', '-Wno-reorder']
+            return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
 
     def nvcc_args(self):
         if self.build_for_cpu:

From 9a36d8be4b7de406f223a271c25625e2b276a510 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Fri, 16 Aug 2024 06:19:09 -0400
Subject: [PATCH 31/31] Usability improvements

---
 csrc/aio/py_lib/py_ds_aio.cpp            |  87 ++++++++++++++----
 csrc/aio/py_test/aio_bench_perf_sweep.py |  11 ++-
 csrc/aio/py_test/ds_aio_args.py          |   2 +-
 csrc/gds/py_lib/py_ds_gds.cpp            | 111 +++++++++++++++++++----
 4 files changed, 169 insertions(+), 42 deletions(-)
 mode change 100755 => 100644 csrc/aio/py_lib/py_ds_aio.cpp

diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp
old mode 100755
new mode 100644
index 78e91c1607b1..3171d0c6bf3c
--- a/csrc/aio/py_lib/py_ds_aio.cpp
+++ b/csrc/aio/py_lib/py_ds_aio.cpp
@@ -23,11 +23,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
         .def(py::init<const int, const int, const bool, const bool, const int>(),
              "AIO handle constructor",
-             "block_size"_a,
-             "queue_depth"_a,
-             "single_submit"_a,
-             "overlap_events"_a,
-             "num_threads"_a)
+             "block_size"_a = 1024 * 1024,
+             "queue_depth"_a = 128,
+             "single_submit"_a = false,
+             "overlap_events"_a = false,
+             "num_threads"_a = 1)
 
         .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
         .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
@@ -35,19 +35,74 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
         .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
         .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
 
-        .def("read", &deepspeed_aio_handle_t::read)
-        .def("write", &deepspeed_aio_handle_t::write)
+        .def("read",
+             &deepspeed_aio_handle_t::read,
+             "Synchronous and non-parallel file read. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a)
 
-        .def("pread", &deepspeed_aio_handle_t::pread)
-        .def("pwrite", &deepspeed_aio_handle_t::pwrite)
+        .def("write",
+             &deepspeed_aio_handle_t::write,
+             "Synchronous and non-parallel file write. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a)
 
-        .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
-        .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
-        .def("async_pread", &deepspeed_aio_handle_t::async_pread)
-        .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
+        .def("pread",
+             &deepspeed_aio_handle_t::pread,
+             "Parallel file read with option of parallelism. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "async"_a)
 
-        .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
-        .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
+        .def("pwrite",
+             &deepspeed_aio_handle_t::pwrite,
+             "Parallel file write with option of parallelism. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "async"_a)
 
-        .def("wait", &deepspeed_aio_handle_t::wait);
+        .def("sync_pread",
+             &deepspeed_aio_handle_t::sync_pread,
+             "Synchrononous parallel file read. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("sync_pwrite",
+             &deepspeed_aio_handle_t::sync_pwrite,
+             "Synchronous parallel file write. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("async_pread",
+             &deepspeed_aio_handle_t::async_pread,
+             "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
+             "following wait() returns count of completed ops.",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("async_pwrite",
+             &deepspeed_aio_handle_t::async_pwrite,
+             "Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
+             "count of completed ops.",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("new_cpu_locked_tensor",
+             &deepspeed_aio_handle_t::new_cpu_locked_tensor,
+             "Allocate pinned CPU tensor.",
+             "num_elem"_a,
+             "example_tenosr"_a)
+
+        .def("free_cpu_locked_tensor",
+             &deepspeed_aio_handle_t::free_cpu_locked_tensor,
+             "Free pinned CPU tensor.",
+             "tensor"_a)
+
+        .def("wait",
+             &deepspeed_aio_handle_t::wait,
+             "Wait for (ongoing) asynchronous operations to complete");
 }
diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py
index 24339ea5a54e..ba95150b11e1 100644
--- a/csrc/aio/py_test/aio_bench_perf_sweep.py
+++ b/csrc/aio/py_test/aio_bench_perf_sweep.py
@@ -20,11 +20,11 @@
 OTHER_OPTIONS = '--handle'
 PERF_SCRIPT = 'test_ds_aio.py'
 DEFAULT_SWEEP_CONFIG = {
-    "block_size": ["128K", "256K"],
-    "queue_depth": [4, 16, 32],
+    "block_size": ["128K", "1M"],
+    "queue_depth": [32, 64, 128],
     "sequential_requests": [True, False],
     "single_submit": [False],
-    "io_parallel": [2, 8]
+    "io_parallel": [1, 2, 8],
 }
 
 
@@ -95,8 +95,9 @@ def dump_cmd_lines(cmd_lines):
 
 
 def get_ftd_map(nvme_dir_list):
-    dir_list = [' '.join(nvme_dir_list[:(i + 1)]) for i in range(len(nvme_dir_list))]
-    return {'folder_to_device_mapping': dir_list}
+    ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)]
+    ftd_arg = [' '.join(ftd for ftd in ftd_list)]
+    return {'folder_to_device_mapping': ftd_arg}
 
 
 def get_sweep_config_dict(sweep_config_json):
diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py
index 5fc3098d3357..346feabe4810 100644
--- a/csrc/aio/py_test/ds_aio_args.py
+++ b/csrc/aio/py_test/ds_aio_args.py
@@ -106,7 +106,7 @@ def parse_arguments():
                         nargs='+',
                         help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).'
                         'Can be specified multiple times for multi-process runs,'
-                        'e.g. --path_map /mnt/nvme0:0 --path_map /mnt/nvme1:15 --gpu'
+                        'e.g. --folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu'
                         'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15')
 
     parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.')
diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp
index 10a7da1535ed..66eb34d4ea8c 100644
--- a/csrc/gds/py_lib/py_ds_gds.cpp
+++ b/csrc/gds/py_lib/py_ds_gds.cpp
@@ -16,11 +16,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     py::class_<deepspeed_gds_handle_t>(m, "gds_handle")
         .def(py::init<const int, const int, const bool, const bool, const int>(),
              "GDS handle constructor",
-             "block_size"_a,
-             "queue_depth"_a,
-             "single_submit"_a,
-             "overlap_events"_a,
-             "num_threads"_a)
+             "block_size"_a = 1024 * 1024,
+             "queue_depth"_a = 128,
+             "single_submit"_a = false,
+             "overlap_events"_a = false,
+             "num_threads"_a = 1)
 
         .def("get_block_size", &deepspeed_gds_handle_t::get_block_size)
         .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth)
@@ -28,24 +28,95 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
         .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events)
         .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count)
 
-        .def("read", &deepspeed_gds_handle_t::read)
-        .def("write", &deepspeed_gds_handle_t::write)
+        .def("read",
+             &deepspeed_gds_handle_t::read,
+             "Synchronous and non-parallel file read. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a)
 
-        .def("pread", &deepspeed_gds_handle_t::pread)
-        .def("pwrite", &deepspeed_gds_handle_t::pwrite)
+        .def("write",
+             &deepspeed_gds_handle_t::write,
+             "Synchronous and non-parallel file write. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a)
 
-        .def("sync_pread", &deepspeed_gds_handle_t::sync_pread)
-        .def("sync_pwrite", &deepspeed_gds_handle_t::sync_pwrite)
-        .def("async_pread", &deepspeed_gds_handle_t::async_pread)
-        .def("async_pwrite", &deepspeed_gds_handle_t::async_pwrite)
+        .def("pread",
+             &deepspeed_gds_handle_t::pread,
+             "Parallel file read with option of parallelism. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "async"_a)
 
-        .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor)
-        .def("free_cpu_locked_tensor", &deepspeed_gds_handle_t::free_cpu_locked_tensor)
+        .def("pwrite",
+             &deepspeed_gds_handle_t::pwrite,
+             "Parallel file write with option of parallelism. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "async"_a)
 
-        .def("new_pinned_device_tensor", &deepspeed_gds_handle_t::new_pinned_device_tensor)
-        .def("free_pinned_device_tensor", &deepspeed_gds_handle_t::free_pinned_device_tensor)
-        .def("pin_device_tensor", &deepspeed_gds_handle_t::pin_device_tensor)
-        .def("unpin_device_tensor", &deepspeed_gds_handle_t::unpin_device_tensor)
+        .def("sync_pread",
+             &deepspeed_gds_handle_t::sync_pread,
+             "Synchrononous parallel file read. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a)
 
-        .def("wait", &deepspeed_gds_handle_t::wait);
+        .def("sync_pwrite",
+             &deepspeed_gds_handle_t::sync_pwrite,
+             "Synchronous parallel file write. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("async_pread",
+             &deepspeed_gds_handle_t::async_pread,
+             "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
+             "following wait() returns count of completed ops.",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("async_pwrite",
+             &deepspeed_gds_handle_t::async_pwrite,
+             "Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
+             "count of completed ops.",
+             "buffer"_a,
+             "filename"_a)
+
+        .def("new_cpu_locked_tensor",
+             &deepspeed_gds_handle_t::new_cpu_locked_tensor,
+             "Allocate pinned CPU tensor.",
+             "num_elem"_a,
+             "example_tenosr"_a)
+
+        .def("free_cpu_locked_tensor",
+             &deepspeed_gds_handle_t::free_cpu_locked_tensor,
+             "Free pinned CPU tensor.",
+             "tensor"_a)
+
+        .def("new_pinned_device_tensor",
+             &deepspeed_gds_handle_t::new_pinned_device_tensor,
+             "Allocate pinned device tensor.",
+             "num_elem"_a,
+             "example_tenosr"_a)
+
+        .def("free_pinned_device_tensor",
+             &deepspeed_gds_handle_t::free_pinned_device_tensor,
+             "Free pinned device tensor.",
+             "tensor"_a)
+
+        .def("pin_device_tensor",
+             &deepspeed_gds_handle_t::pin_device_tensor,
+             "Pin device tensor.",
+             "tensor"_a)
+
+        .def("unpin_device_tensor",
+             &deepspeed_gds_handle_t::unpin_device_tensor,
+             "Unpin device tensor.",
+             "tensor"_a)
+
+        .def("wait",
+             &deepspeed_gds_handle_t::wait,
+             "Wait for (ongoing) asynchronous operations to complete");
 }