From bb04e1acb29c4e39b91132a0da6db92740d89f65 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 13 Dec 2024 16:26:15 +0100 Subject: [PATCH] Refactor and fix GPU array index generation --- dace/codegen/targets/cpp.py | 1 + dace/codegen/targets/cuda.py | 10 +++++++--- dace/codegen/targets/framecode.py | 2 +- dace/data.py | 4 ++-- dace/dtypes.py | 6 ++++++ tests/deferred_alloc_test.py | 32 ++++++++++++------------------- 6 files changed, 29 insertions(+), 26 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 8357ca1fa8..ed7c7bba49 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -631,6 +631,7 @@ def _get_realloc_dimensions(size_array_name:str, new_size_array_name:str, shape) f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" ) else: + old_size_strs.append(sym2cpp(shape[i])) new_size_strs.append(sym2cpp(shape[i])) return size_assignment_strs, new_size_strs, old_size_strs diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 418cbbfdbd..6d77daa219 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1632,6 +1632,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub size_desc = sdfg.arrays[size_desc_name] dyn_args_typed.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size") needed_size_scalars_declaration.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];") + #raise Exception(needed_size_scalars_declaration, dyn_args) self._localcode.write( '__global__ void %s %s(%s) {\n' % @@ -2065,6 +2066,9 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # handle dynamic map inputs for e in dace.sdfg.dynamic_map_inputs(cfg.node(state_id), dfg_scope.source_nodes()[0]): + # If src is a _read_size, it was handled before + if e.src_conn is not None and e.src_conn == "_read_size": + continue kernel_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, state_id, @@ -2810,14 +2814,14 @@ def reallocate( tmp_storage_name = "__tmp_realloc_move_storage" callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid) - if data.storage == dtypes.StorageType.GPU_Global: - assert data.storage == dtypes.StorageType.CPU_Pinned + if data.storage == dtypes.StorageType.GPU_Global: self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str) else: + assert data.storage == dtypes.StorageType.CPU_Pinned callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast(&{data_name}), {new_size_str}));", cfg, state_id, dst_node.guid) callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid) callsite_stream.write(f"{dtype}* {tmp_storage_name};") - if data.storage == dtypes.StorageType.GPU_Global: + if data.storage == dtypes.StorageType.GPU_Global: self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str) else: assert data.storage == dtypes.StorageType.CPU_Pinned diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 4e4f4bc372..47a7ab03a4 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -969,7 +969,7 @@ def generate_code(self, size_nodedesc = sdfg.arrays[size_desc_name] assert ("__return" not in size_desc_name) ctypedef = size_nodedesc.dtype.ctype - array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] + array = [v for v in sdfg.arrays.values() if type(v) == data.Array and v.size_desc_name is not None and v.size_desc_name == size_desc_name] assert len(array) <= 1 if len(array) == 1: array = array[0] diff --git a/dace/data.py b/dace/data.py index a3b008f150..509da50cf6 100644 --- a/dace/data.py +++ b/dace/data.py @@ -183,7 +183,7 @@ def _transient_setter(self, value): default=dtypes.AllocationLifetime.Scope) location = DictProperty(key_type=str, value_type=str, desc='Full storage location identifier (e.g., rank, GPU ID)') debuginfo = DebugInfoProperty(allow_none=True) - size_desc_name = Property(dtype=str, default=None, allow_none=True) + def __init__(self, dtype, shape, transient, storage, location, lifetime, debuginfo): self.dtype = dtype @@ -193,7 +193,6 @@ def __init__(self, dtype, shape, transient, storage, location, lifetime, debugin self.location = location if location is not None else {} self.lifetime = lifetime self.debuginfo = debuginfo - self.size_desc_name = None self._validate() def __call__(self): @@ -1387,6 +1386,7 @@ class Array(Data): 'it is inferred by other properties and the OptionalArrayInference pass.') pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') + size_desc_name = Property(dtype=str, default=None, allow_none=True, desc="Name of the size desc, not None only for reallocatable storage types that are also transient") is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array') is_deferred_array = Property(dtype=bool, default=False, desc='Array that requires deferred allocation') diff --git a/dace/dtypes.py b/dace/dtypes.py index 465e73b2b1..d09bfd4210 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -103,6 +103,12 @@ class ScheduleType(aenum.AutoNumberEnum): StorageType.FPGA_ShiftRegister, ] +REALLOCATABLE_STORAGES = [ + StorageType.CPU_Heap, + StorageType.CPU_Pinned, + StorageType.GPU_Global, +] + @undefined_safe_enum class ReductionType(aenum.AutoNumberEnum): diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 1d9df3a200..73ddb7d40a 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -369,7 +369,7 @@ def test_conditional_alloc_cpu_pinned(): size2 = numpy.array([22, 22]).astype(numpy.uint64) arr = numpy.array([-1.0]).astype(numpy.float32) sdfg(path=1, size1=size1, size2=size2, example_array=arr) - assert ( arr.get()[0] == 3.0 ) + assert ( arr[0] == 3.0 ) def test_conditional_alloc_cpu(): sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore) @@ -403,7 +403,7 @@ def test_conditional_alloc_with_expr_cpu_pinned(): size2 = numpy.array([22, 22]).astype(numpy.uint64) arr = numpy.array([-1.0]).astype(numpy.float32) sdfg(path=1, size1=size1, size2=size2, example_array=arr) - assert ( arr.get()[0] == 3.0 ) + assert ( arr[0] == 3.0 ) def test_conditional_alloc_with_expr_cpu(): sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore, True) @@ -443,32 +443,24 @@ def test_incomplete_write_dimensions_2(): print(f"Trivial Realloc with storage, cpu") test_trivial_realloc_cpu(True) - print(f"Trivial Realloc-Use with storage, cpu") - test_realloc_use_cpu(True) - print(f"Trivial Realloc within map, cpu pinned") - test_realloc_use_cpu_pinned(True) - print(f"Trivial Realloc with storage, gpu") test_trivial_realloc_gpu(True) - print(f"Trivial Realloc-Use with storage, gpu") - test_realloc_use_gpu(True) - print(f"Trivial Realloc-Use with storage, cpu pinned") - test_realloc_use_cpu_pinned(True) + print(f"Trivial Realloc with storage, cpu pinned") + test_trivial_realloc_cpu_pinned(True) print(f"Trivial Realloc with storage, cpu, on non-transient data") test_trivial_realloc_cpu(False) - print(f"Trivial Realloc-Use with storage, cpu, on non-transient data") - test_realloc_use_cpu(False) - print(f"Trivial Realloc with storage, gpu, on non-transient data") - test_trivial_realloc_gpu(False) - print(f"Trivial Realloc-Use with storage, gpu, on non-transient data") - test_realloc_use_gpu(False) + test_trivial_realloc_gpu(False) print(f"Trivial Realloc with storage, cpu pinned, on non-transient data") test_trivial_realloc_cpu_pinned(False) - print(f"Trivial Realloc-Use with storage, cpu pinned, on non-transient data") - test_realloc_use_cpu_pinned(False) + print(f"Trivial Realloc-Use with storage, cpu") + test_realloc_use_cpu(True) + print(f"Trivial Realloc-Use with storage, gpu") + test_realloc_use_gpu(True) + print(f"Trivial Realloc-Use with storage, cpu pinned") + test_realloc_use_cpu_pinned(True) print(f"Realloc with incomplete write one, validation") test_incomplete_write_dimensions_1() print(f"Realloc with incomplete write two, validation") @@ -486,4 +478,4 @@ def test_incomplete_write_dimensions_2(): print(f"Test conditional alloc with use and the shape as a non-trivial expression, gpu") test_conditional_alloc_with_expr_gpu() print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu pinned") - test_conditional_alloc_with_expr_cpu_pinned() \ No newline at end of file + test_conditional_alloc_with_expr_cpu_pinned()