diff --git a/dev/cuda/benchmark_on_modal.py b/dev/cuda/benchmark_on_modal.py
index 907a831ad..4eed462ca 100644
--- a/dev/cuda/benchmark_on_modal.py
+++ b/dev/cuda/benchmark_on_modal.py
@@ -3,19 +3,22 @@
 This is useful for folks who do not have access to expensive GPUs locally.
 
 Example usage for cuda kernels:
 GPU_MEM=80 modal run benchmark_on_modal.py \
-    --compile-command "nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas" \
+    --data-command="./dev/download_starter_pack.sh" \
+    --compile-command "nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas -lcublasLt" \
     --run-command "./attention_forward 1"
 
 OR if you want to use cuDNN etc.
 
 For training the gpt2 model with cuDNN use:
 GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \
-    --compile-command "make train_gpt2cu USE_CUDNN=1"
+    --data-command="./dev/download_starter_pack.sh" \
+    --compile-command "make train_gpt2cu USE_CUDNN=1" \
     --run-command "./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin -j dev/data/tinyshakespeare/tiny_shakespeare_val.bin -v 250 -s 250 -g 144 -f shakespeare.log -b 4"
 
 For profiling using nsight system:
 GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \
+    --data-command="./dev/download_starter_pack.sh" \
     --compile-command "make train_gpt2cu USE_CUDNN=1" \
     --run-command "nsys profile --cuda-graph-trace=graph --python-backtrace=cuda --cuda-memory-usage=true \
     ./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin \
@@ -62,10 +65,10 @@
         "rm cmake-3.28.1-Linux-x86_64.sh",
         "ln -s /usr/local/bin/cmake /usr/bin/cmake",)
     .run_commands(
-        "apt-get install -y --allow-change-held-packages libcudnn8 libcudnn8-dev",
+        "apt-get install -y --allow-change-held-packages libcudnn9-cuda-12 libcudnn9-dev-cuda-12",
         "apt-get install -y openmpi-bin openmpi-doc libopenmpi-dev kmod sudo",
         "git clone https://github.com/NVIDIA/cudnn-frontend.git /root/cudnn-frontend",
-        "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make"
+        "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make -j$(nproc)"
     )
     .run_commands(
         "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
@@ -75,6 +78,8 @@
         apt-get update"
     ).run_commands(
         "apt-get install -y nsight-systems-2023.3.3"
+    ).run_commands(
+        "apt-get install -y curl"
     )
 )
 
@@ -98,16 +103,16 @@ def execute_command(command: str):
     # using in a directory in your volume, where the name contains the timestamp unique id.
     # This script will generate a "report1_{timestamp} folder in volume"
     # and you can download it with 'modal volume get {volume-name} report1_{timestamp}
-    volumes={"/cuda-env": modal.Volume.from_name("cuda-env")},
+    volumes={"/llmc": modal.Volume.from_name("llmc")},
 )
-def run_benchmark(compile_command: str, run_command: str):
+def run_benchmark(data_command: str, compile_command: str, run_command: str):
     execute_command("pwd")
     execute_command("ls")
+    execute_command(data_command)
     execute_command(compile_command)
     execute_command(run_command)
     # Use this section if you want to profile using nsight system and install the reports on your volume to be locally downloaded
     timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-    execute_command("mkdir report1_" + timestamp)
     execute_command("mv /root/report1.nsys-rep /root/report1_" + timestamp + "/")
     execute_command("mv /root/report1.qdstrm /root/report1_" + timestamp + "/")
 
@@ -116,6 +121,6 @@ def run_benchmark(compile_command: str, run_command: str):
     return None
 
 @stub.local_entrypoint()
-def inference_main(compile_command: str, run_command: str):
-    results = run_benchmark.remote(compile_command, run_command)
+def inference_main(data_command: str, compile_command: str, run_command: str):
+    results = run_benchmark.remote(data_command, compile_command, run_command)
     return results
\ No newline at end of file
diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h
index 6f5bf6564..d5d2f8289 100644
--- a/llmc/cuda_common.h
+++ b/llmc/cuda_common.h
@@ -179,7 +179,7 @@ inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffe
     // prime the read buffer;
     char* gpu_write_ptr = (char*)dest;
     size_t copy_amount = std::min(buffer_size, num_bytes);
-    freadCheck(read_buffer, 1, copy_amount, src);
+    // freadCheck(read_buffer, 1, copy_amount, src);
 
     size_t rest_bytes = num_bytes - copy_amount;
     size_t write_buffer_size = copy_amount;
@@ -192,7 +192,7 @@ inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffe
         cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream));
         gpu_write_ptr += write_buffer_size;
         // while this is going on, read from disk
-        freadCheck(read_buffer, 1, copy_amount, src);
+        //freadCheck(read_buffer, 1, copy_amount, src);
         cudaCheck(cudaStreamSynchronize(stream)); // wait for both buffers to be ready.
 
         std::swap(read_buffer, write_buffer);
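
For context on the file_to_device hunks above: the function streams a file to the GPU using two pinned host buffers, reading the next chunk from disk (the freadCheck calls) while the previous chunk is in flight to the device on a CUDA stream. Below is a minimal standalone sketch of that double-buffering pattern, not llm.c's actual implementation: the name stream_file_to_device is hypothetical, plain fread stands in for llm.c's checked wrapper, and error checking is omitted.

// Sketch of double-buffered disk-to-GPU streaming (assumptions noted above).
#include <algorithm>
#include <cstdio>
#include <utility>
#include <cuda_runtime.h>

void stream_file_to_device(void* dest, FILE* src, size_t num_bytes,
                           size_t buffer_size, cudaStream_t stream) {
    char* buffer_space;
    cudaMallocHost(&buffer_space, 2 * buffer_size);  // pinned, so async copies truly overlap
    char* read_buffer  = buffer_space;               // being filled from disk
    char* write_buffer = buffer_space + buffer_size; // being uploaded to the GPU
    char* gpu_write_ptr = (char*)dest;

    // prime the pipeline: a blocking read of the first chunk
    size_t copy_amount = std::min(buffer_size, num_bytes);
    fread(read_buffer, 1, copy_amount, src);
    size_t rest_bytes = num_bytes - copy_amount;
    size_t write_size = copy_amount;
    std::swap(read_buffer, write_buffer);

    while (write_size > 0) {
        // start the async upload of the chunk we already have...
        cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_size,
                        cudaMemcpyHostToDevice, stream);
        gpu_write_ptr += write_size;
        // ...and overlap it with the disk read of the next chunk
        copy_amount = std::min(buffer_size, rest_bytes);
        if (copy_amount > 0) fread(read_buffer, 1, copy_amount, src);
        cudaStreamSynchronize(stream);  // both buffers are ready again
        std::swap(read_buffer, write_buffer);
        write_size = copy_amount;
        rest_bytes -= copy_amount;
    }
    cudaFreeHost(buffer_space);
}

The pinned allocation is the design point: cudaMemcpyAsync only overlaps with host work when the host buffer is page-locked, which is why the buffers come from cudaMallocHost rather than malloc.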