Refactor and fix GPU array index generation
ThrudPrimrose committed Dec 13, 2024
1 parent ae08459 · commit bb04e1a
Showing 6 changed files with 29 additions and 26 deletions.
1 change: 1 addition & 0 deletions dace/codegen/targets/cpp.py
@@ -631,6 +631,7 @@ def _get_realloc_dimensions(size_array_name:str, new_size_array_name:str, shape)
                 f"{size_array_name}[{i}] = {new_size_array_name}[{i}];"
             )
         else:
+            old_size_strs.append(sym2cpp(shape[i]))
             new_size_strs.append(sym2cpp(shape[i]))
     return size_assignment_strs, new_size_strs, old_size_strs

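The added line keeps old_size_strs index-aligned with new_size_strs for dimensions that are not reallocated; previously only the new-size list recorded the static extent. A minimal runnable sketch of the surrounding loop, with the branch condition ("written_dims") and the entry formats assumed for illustration rather than taken verbatim from DaCe:

    # Hypothetical reconstruction of _get_realloc_dimensions' loop.
    def get_realloc_dimensions(size_array_name, new_size_array_name, shape, written_dims):
        def sym2cpp(expr):  # stand-in for dace.codegen.targets.cpp.sym2cpp
            return str(expr)

        size_assignment_strs, new_size_strs, old_size_strs = [], [], []
        for i in range(len(shape)):
            if i in written_dims:  # assumption: only written dimensions get reassigned
                size_assignment_strs.append(
                    f"{size_array_name}[{i}] = {new_size_array_name}[{i}];")
                old_size_strs.append(f"{size_array_name}[{i}]")
                new_size_strs.append(f"{new_size_array_name}[{i}]")
            else:
                # The fix: record the static extent in BOTH lists so the old and
                # new size expressions stay index-aligned across all dimensions.
                old_size_strs.append(sym2cpp(shape[i]))
                new_size_strs.append(sym2cpp(shape[i]))
        return size_assignment_strs, new_size_strs, old_size_strs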
10 changes: 7 additions & 3 deletions dace/codegen/targets/cuda.py
@@ -1632,6 +1632,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
                     size_desc = sdfg.arrays[size_desc_name]
                     dyn_args_typed.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size")
                     needed_size_scalars_declaration.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
+        #raise Exception(needed_size_scalars_declaration, dyn_args)
 
         self._localcode.write(
             '__global__ void %s %s(%s) {\n' %
@@ -2065,6 +2066,9 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S
 
         # handle dynamic map inputs
         for e in dace.sdfg.dynamic_map_inputs(cfg.node(state_id), dfg_scope.source_nodes()[0]):
+            # If src is a _read_size, it was handled before
+            if e.src_conn is not None and e.src_conn == "_read_size":
+                continue
             kernel_stream.write(
                 self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]),
                 cfg, state_id,
@@ -2810,14 +2814,14 @@ def reallocate(
         tmp_storage_name = "__tmp_realloc_move_storage"
 
         callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid)
-        if data.storage == dtypes.StorageType.GPU_Global:
-            assert data.storage == dtypes.StorageType.CPU_Pinned
+        if data.storage == dtypes.StorageType.GPU_Global:
             self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str)
         else:
+            assert data.storage == dtypes.StorageType.CPU_Pinned
             callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast<void**>(&{data_name}), {new_size_str}));", cfg, state_id, dst_node.guid)
         callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid)
         callsite_stream.write(f"{dtype}* {tmp_storage_name};")
-        if data.storage == dtypes.StorageType.GPU_Global:
+        if data.storage == dtypes.StorageType.GPU_Global:
             self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str)
         else:
             assert data.storage == dtypes.StorageType.CPU_Pinned
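The first two cuda.py hunks belong together: each deferred-size dimension reaches the kernel as a const scalar named __{arr_name}_dim{i}_size, declared on the host from the size array, so dynamic map inputs arriving over a _read_size connector are skipped in generate_kernel_scope rather than redefined. An illustration of the strings being built (array name, size-array name, and C type are hypothetical values):

    # Hypothetical values; mirrors the two f-strings in generate_scope.
    arr_name, i = "A", 0
    size_desc_name = "__A_size"       # assumed name of the size-tracking array
    ctype = "unsigned long long"      # assumed C type of a uint64 size array
    dyn_arg = f"const {ctype} __{arr_name}_dim{i}_size"
    decl = f"const {ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];"
    print(dyn_arg)  # const unsigned long long __A_dim0_size
    print(decl)     # const unsigned long long __A_dim0_size = __A_size[0];

The third hunk is a genuine bug fix: the CPU_Pinned assertion sat inside the GPU_Global branch, where it could never hold; it now guards the else branch (the {backend}MallocHost path) it was meant for.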
2 changes: 1 addition & 1 deletion dace/codegen/targets/framecode.py
@@ -969,7 +969,7 @@ def generate_code(self,
             size_nodedesc = sdfg.arrays[size_desc_name]
             assert ("__return" not in size_desc_name)
             ctypedef = size_nodedesc.dtype.ctype
-            array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name]
+            array = [v for v in sdfg.arrays.values() if type(v) == data.Array and v.size_desc_name is not None and v.size_desc_name == size_desc_name]
             assert len(array) <= 1
             if len(array) == 1:
                 array = array[0]
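The new type(v) == data.Array guard is needed because this commit moves size_desc_name from Data to Array (see the dace/data.py hunks below): an unguarded attribute access on a Scalar or other descriptor would now raise AttributeError. Note also that type(...) ==, unlike isinstance, excludes Array subclasses such as Views. A short sketch of the hazard, assuming default-constructed descriptors:

    from dace import data, float32, int32

    arr = data.Array(float32, [10])
    scl = data.Scalar(int32)
    print(arr.size_desc_name)                           # None (property default)
    print(getattr(scl, "size_desc_name", "<missing>"))  # "<missing>" after this commit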
4 changes: 2 additions & 2 deletions dace/data.py
@@ -183,7 +183,7 @@ def _transient_setter(self, value):
                            default=dtypes.AllocationLifetime.Scope)
     location = DictProperty(key_type=str, value_type=str, desc='Full storage location identifier (e.g., rank, GPU ID)')
     debuginfo = DebugInfoProperty(allow_none=True)
-    size_desc_name = Property(dtype=str, default=None, allow_none=True)
+
 
     def __init__(self, dtype, shape, transient, storage, location, lifetime, debuginfo):
         self.dtype = dtype
@@ -193,7 +193,6 @@ def __init__(self, dtype, shape, transient, storage, location, lifetime, debugin
         self.location = location if location is not None else {}
         self.lifetime = lifetime
         self.debuginfo = debuginfo
-        self.size_desc_name = None
         self._validate()
 
     def __call__(self):
@@ -1387,6 +1386,7 @@ class Array(Data):
             'it is inferred by other properties and the OptionalArrayInference pass.')
     pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred')
 
+    size_desc_name = Property(dtype=str, default=None, allow_none=True, desc="Name of the size desc, not None only for reallocatable storage types that are also transient")
     is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array')
     is_deferred_array = Property(dtype=bool, default=False, desc='Array that requires deferred allocation')
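Relocating size_desc_name also narrows its meaning, per the new property description: only transient arrays in reallocatable storage carry a size descriptor. A hypothetical helper making that invariant explicit (the __{name}_size naming scheme is made up for illustration; REALLOCATABLE_STORAGES is added in the dace/dtypes.py hunk below):

    from typing import Optional
    from dace import data, dtypes

    def expected_size_desc_name(name: str, desc: data.Data) -> Optional[str]:
        # Assumption: mirrors the property description above.
        if (isinstance(desc, data.Array) and desc.transient
                and desc.storage in dtypes.REALLOCATABLE_STORAGES):
            return f"__{name}_size"  # illustrative naming only
        return None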
6 changes: 6 additions & 0 deletions dace/dtypes.py
@@ -103,6 +103,12 @@ class ScheduleType(aenum.AutoNumberEnum):
     StorageType.FPGA_ShiftRegister,
 ]
 
+REALLOCATABLE_STORAGES = [
+    StorageType.CPU_Heap,
+    StorageType.CPU_Pinned,
+    StorageType.GPU_Global,
+]
+
 
 @undefined_safe_enum
 class ReductionType(aenum.AutoNumberEnum):
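REALLOCATABLE_STORAGES gathers the three storage types whose buffers can be resized after allocation: heap and pinned memory on the host, and global memory on the device. A typical membership check (the wrapper function is hypothetical):

    from dace import dtypes

    def storage_is_reallocatable(storage: dtypes.StorageType) -> bool:
        # Registers, FPGA storages, etc. fall through to False.
        return storage in dtypes.REALLOCATABLE_STORAGES

    assert storage_is_reallocatable(dtypes.StorageType.GPU_Global)
    assert not storage_is_reallocatable(dtypes.StorageType.Register)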
32 changes: 12 additions & 20 deletions tests/deferred_alloc_test.py
@@ -369,7 +369,7 @@ def test_conditional_alloc_cpu_pinned():
     size2 = numpy.array([22, 22]).astype(numpy.uint64)
     arr = numpy.array([-1.0]).astype(numpy.float32)
     sdfg(path=1, size1=size1, size2=size2, example_array=arr)
-    assert ( arr.get()[0] == 3.0 )
+    assert ( arr[0] == 3.0 )
 
 def test_conditional_alloc_cpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore)
@@ -403,7 +403,7 @@ def test_conditional_alloc_with_expr_cpu_pinned():
     size2 = numpy.array([22, 22]).astype(numpy.uint64)
     arr = numpy.array([-1.0]).astype(numpy.float32)
     sdfg(path=1, size1=size1, size2=size2, example_array=arr)
-    assert ( arr.get()[0] == 3.0 )
+    assert ( arr[0] == 3.0 )
 
 def test_conditional_alloc_with_expr_cpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore, True)
@@ -443,32 +443,24 @@ def test_incomplete_write_dimensions_2():
 
     print(f"Trivial Realloc with storage, cpu")
     test_trivial_realloc_cpu(True)
-    print(f"Trivial Realloc-Use with storage, cpu")
-    test_realloc_use_cpu(True)
-    print(f"Trivial Realloc within map, cpu pinned")
-    test_realloc_use_cpu_pinned(True)
-
     print(f"Trivial Realloc with storage, gpu")
     test_trivial_realloc_gpu(True)
-    print(f"Trivial Realloc-Use with storage, gpu")
-    test_realloc_use_gpu(True)
-    print(f"Trivial Realloc-Use with storage, cpu pinned")
-    test_realloc_use_cpu_pinned(True)
+    print(f"Trivial Realloc with storage, cpu pinned")
+    test_trivial_realloc_cpu_pinned(True)
 
     print(f"Trivial Realloc with storage, cpu, on non-transient data")
     test_trivial_realloc_cpu(False)
-    print(f"Trivial Realloc-Use with storage, cpu, on non-transient data")
-    test_realloc_use_cpu(False)
     print(f"Trivial Realloc with storage, gpu, on non-transient data")
-    test_trivial_realloc_gpu(False)
-
-    print(f"Trivial Realloc-Use with storage, gpu, on non-transient data")
-    test_realloc_use_gpu(False)
+    test_trivial_realloc_gpu(False)
     print(f"Trivial Realloc with storage, cpu pinned, on non-transient data")
     test_trivial_realloc_cpu_pinned(False)
-    print(f"Trivial Realloc-Use with storage, cpu pinned, on non-transient data")
-    test_realloc_use_cpu_pinned(False)
 
+    print(f"Trivial Realloc-Use with storage, cpu")
+    test_realloc_use_cpu(True)
+    print(f"Trivial Realloc-Use with storage, gpu")
+    test_realloc_use_gpu(True)
+    print(f"Trivial Realloc-Use with storage, cpu pinned")
+    test_realloc_use_cpu_pinned(True)
     print(f"Realloc with incomplete write one, validation")
     test_incomplete_write_dimensions_1()
     print(f"Realloc with incomplete write two, validation")
@@ -486,4 +478,4 @@
     print(f"Test conditional alloc with use and the shape as a non-trivial expression, gpu")
     test_conditional_alloc_with_expr_gpu()
     print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu pinned")
-    test_conditional_alloc_with_expr_cpu_pinned()
\ No newline at end of file
+    test_conditional_alloc_with_expr_cpu_pinned()
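On the two assertion fixes in the CPU-pinned tests: example_array is a NumPy array in host-visible pinned memory, so it is indexed directly; .get() is CuPy's device-to-host copy, which NumPy arrays do not have (the GPU variants of these tests presumably keep it). A sketch of the distinction:

    import numpy
    arr = numpy.array([-1.0]).astype(numpy.float32)  # host-side output buffer
    # ... sdfg(path=1, size1=size1, size2=size2, example_array=arr) ...
    assert arr[0] == 3.0   # host memory: index directly
    # arr.get()            # AttributeError: .get() is a CuPy method, not NumPy

The remaining hunks regroup the __main__ test driver: the trivial-realloc runs now precede the realloc-use runs, test_trivial_realloc_gpu(False) moves under its matching print, and the realloc-use calls on non-transient data are dropped.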
