diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml index ef8e5348da..2d6d42514f 100644 --- a/.github/workflows/fpga-ci.yml +++ b/.github/workflows/fpga-ci.yml @@ -16,7 +16,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-ci') }} runs-on: [self-hosted, linux, intel-fpga, xilinx-fpga] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml index faf0a727be..cde07f0406 100644 --- a/.github/workflows/general-ci.yml +++ b/.github/workflows/general-ci.yml @@ -18,11 +18,11 @@ jobs: simplify: [0,1,autoopt] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -55,7 +55,7 @@ jobs: else export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} fi - pytest -n auto --cov-report=xml --cov=dace --tb=short -m "not gpu and not verilator and not tensorflow and not mkl and not sve and not papi and not mlir and not lapack and not fpga and not mpi and not rtl_hardware and not scalapack and not datainstrument" + pytest -n auto --cov-report=xml --cov=dace --tb=short -m "not gpu and not verilator and not tensorflow and not mkl and not sve and not papi and not mlir and not lapack and not fpga and not mpi and not rtl_hardware and not scalapack and not datainstrument and not long" ./codecov - name: Test OpenBLAS LAPACK diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 527e004478..b3af9c8c05 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -19,7 +19,7 @@ jobs: if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" runs-on: [self-hosted, gpu] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/hardware_test.yml b/.github/workflows/hardware_test.yml index 3fe32aaab7..e319c72587 100644 --- a/.github/workflows/hardware_test.yml +++ b/.github/workflows/hardware_test.yml @@ -4,7 +4,7 @@ jobs: test-rtl: runs-on: [self-hosted, linux, xilinx-fpga] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml index 99b566e21f..62887ad208 100644 --- a/.github/workflows/heterogeneous-ci.yml +++ b/.github/workflows/heterogeneous-ci.yml @@ -19,7 +19,7 @@ jobs: if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" runs-on: [self-hosted, linux] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: 'recursive' - name: Install dependencies diff --git a/.github/workflows/pyFV3-ci.yml b/.github/workflows/pyFV3-ci.yml index f58fdf85ac..852b887cdb 100644 --- a/.github/workflows/pyFV3-ci.yml +++ b/.github/workflows/pyFV3-ci.yml @@ -21,18 +21,18 @@ jobs: python-version: [3.11.7] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: repository: 'NOAA-GFDL/PyFV3' ref: 'ci/DaCe' submodules: 'recursive' path: 'pyFV3' - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: path: 'dace' submodules: 'recursive' - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: 
actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install library dependencies @@ -53,11 +53,11 @@ jobs: cd pyFV3 mkdir -p test_data cd test_data - wget https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.D_SW.tar.gz + wget --retry-connrefused https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.D_SW.tar.gz tar -xzvf 8.1.3_c12_6ranks_standard.D_SW.tar.gz - wget https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.RiemSolver3.tar.gz + wget --retry-connrefused https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.RiemSolver3.tar.gz tar -xzvf 8.1.3_c12_6ranks_standard.RiemSolver3.tar.gz - wget https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.Remapping.tar.gz + wget --retry-connrefused https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.Remapping.tar.gz tar -xzvf 8.1.3_c12_6ranks_standard.Remapping.tar.gz cd ../.. # Clean up caches between run for stale un-expanded SDFG to trip the build system (NDSL side issue) diff --git a/.github/workflows/verilator_compatibility.yml b/.github/workflows/verilator_compatibility.yml index 7f43565812..dce0c9b1fb 100644 --- a/.github/workflows/verilator_compatibility.yml +++ b/.github/workflows/verilator_compatibility.yml @@ -17,14 +17,14 @@ jobs: steps: - name: trigger reason run: echo "Trigger Reason:" ${{ github.event.inputs.reason }} - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: checkout submodules run: git submodule update --init --recursive - name: install apt packages run: sudo apt-get update && sudo apt-get -y install git make autoconf g++ flex bison libfl2 libfl-dev - name: compile verilator run: git clone https://github.com/verilator/verilator.git && cd verilator && git fetch origin && if [ ! "${{ matrix.verilator_version }}" == "master" ]; then git checkout v${{ matrix.verilator_version }}; fi && autoconf && ./configure && make -j2 && sudo make install - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: '3.8' architecture: 'x64' diff --git a/dace/cli/dacelab.py b/dace/cli/dacelab.py index 27a3215e09..647ec31a3d 100644 --- a/dace/cli/dacelab.py +++ b/dace/cli/dacelab.py @@ -2,11 +2,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import argparse -import numpy -import pickle -import json - -import dace from dace.frontend.octave import parse from dace.sdfg.nodes import AccessNode diff --git a/dace/cli/sdfg_diff.py b/dace/cli/sdfg_diff.py index 9c40e59f10..ed661b8c25 100644 --- a/dace/cli/sdfg_diff.py +++ b/dace/cli/sdfg_diff.py @@ -7,9 +7,12 @@ import os import platform import tempfile -from typing import Dict, Literal, Set, Tuple, Union +from typing import Dict, Set, Tuple, Union +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal -import jinja2 import dace from dace import memlet as mlt from dace.sdfg import nodes as nd @@ -179,6 +182,11 @@ def main(): diff_sets = _sdfg_diff(sdfg_A, sdfg_B, eq_strategy) if args.graphical: + try: + import jinja2 + except (ImportError, ModuleNotFoundError): + raise ImportError('Graphical SDFG diff requires jinja2, please install by running `pip install jinja2`') + basepath = os.path.join(os.path.dirname(os.path.realpath(dace.__file__)), 'viewer') template_loader = jinja2.FileSystemLoader(searchpath=os.path.join(basepath, 'templates')) template_env = jinja2.Environment(loader=template_loader) diff --git a/dace/cli/sdfgcc.py b/dace/cli/sdfgcc.py index 1df7604b4b..0d04950be7 100644 --- a/dace/cli/sdfgcc.py +++ b/dace/cli/sdfgcc.py @@ -48,7 +48,7 @@ def main(): sdfg = SDFGOptimizer(sdfg).optimize() # Compile SDFG - sdfg.compile(outpath) + sdfg.compile(outpath, return_program_handle=False) # Copying header file to optional path if outpath is not None: diff --git a/dace/cli/sdfv.py b/dace/cli/sdfv.py index 49255a1e7e..d14059468f 100644 --- a/dace/cli/sdfv.py +++ b/dace/cli/sdfv.py @@ -13,7 +13,6 @@ import dace import tempfile -import jinja2 def partialclass(cls, *args, **kwds): @@ -44,10 +43,19 @@ def view(sdfg: dace.SDFG, filename: Optional[Union[str, int]] = None, verbose: b ): fd, filename = tempfile.mkstemp(suffix='.sdfg') sdfg.save(filename) - os.system(f'code {filename}') + if platform.system() == 'Darwin': + # Special case for MacOS + os.system(f'open {filename}') + else: + os.system(f'code {filename}') os.close(fd) return + try: + import jinja2 + except (ImportError, ModuleNotFoundError): + raise ImportError('SDFG.view() requires jinja2, please install by running `pip install jinja2`') + if type(sdfg) is dace.SDFG: sdfg = dace.serialize.dumps(sdfg.to_json()) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 9bfcc439e0..bae8531e62 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -518,6 +518,9 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: # Otherwise, None values are passed as null pointers below elif isinstance(arg, ctypes._Pointer): pass + elif isinstance(arg, str): + # Cast to bytes + arglist[i] = ctypes.c_char_p(arg.encode('utf-8')) else: raise TypeError(f'Passing an object (type {type(arg).__name__}) to an array in argument "{a}"') elif is_array and not is_dtArray: @@ -550,6 +553,8 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: pass elif isinstance(arg, float) and atype.dtype.type == np.float64: pass + elif isinstance(arg, bool) and atype.dtype.type == np.bool_: + pass elif (isinstance(arg, str) or arg is None) and atype.dtype == dtypes.string: if arg is None: arglist[i] = ctypes.c_char_p(None) @@ -575,7 +580,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: arg_ctypes = tuple(at.dtype.as_ctypes() for at in argtypes) constants = self.sdfg.constants - callparams = tuple((actype(arg.get()) if 
isinstance(arg, symbolic.symbol) else arg, actype, atype, aname) + callparams = tuple((arg, actype, atype, aname) for arg, actype, atype, aname in zip(arglist, arg_ctypes, argtypes, argnames) if not (symbolic.issymbolic(arg) and (hasattr(arg, 'name') and arg.name in constants))) diff --git a/dace/codegen/compiler.py b/dace/codegen/compiler.py index 350e141606..927c59d19d 100644 --- a/dace/codegen/compiler.py +++ b/dace/codegen/compiler.py @@ -13,6 +13,7 @@ import subprocess import re from typing import Any, Callable, Dict, List, Set, Tuple, TypeVar, Union +import warnings import dace from dace.config import Config @@ -57,6 +58,18 @@ def generate_program_folder(sdfg, code_objects: List[CodeObject], out_path: str, code_path = os.path.join(target_folder, basename) clean_code = code_object.clean_code + if Config.get_bool('compiler', 'format_code'): + config_file = Config.get('compiler', 'format_config_file') + if config_file is not None and config_file != "": + run_arg_list = ['clang-format', f"-style=file:{config_file}"] + else: + run_arg_list = ['clang-format'] + result = subprocess.run(run_arg_list, input=clean_code, text=True, capture_output=True) + if result.returncode or result.stderr: + warnings.warn(f'clang-format failed to run: {result.stderr}') + else: + clean_code = result.stdout + # Save the file only if it changed (keeps old timestamps and saves # build time) if not identical_file_exists(code_path, clean_code): @@ -213,7 +226,7 @@ def configure_and_compile(program_folder, program_name=None, output_stream=None) # Clean CMake directory and try once more if Config.get_bool('debugprint'): print('Cleaning CMake build folder and retrying...') - shutil.rmtree(build_folder) + shutil.rmtree(build_folder, ignore_errors=True) os.makedirs(build_folder) try: _run_liveoutput(cmake_command, shell=True, cwd=build_folder, output_stream=output_stream) @@ -260,7 +273,7 @@ def get_environment_flags(environments) -> Tuple[List[str], Set[str]]: """ Returns the CMake environment and linkage flags associated with the given input environments/libraries. - + :param environments: A list of ``@dace.library.environment``-decorated classes. 
:return: A 2-tuple of (environment CMake flags, linkage CMake flags) diff --git a/dace/codegen/cppunparse.py b/dace/codegen/cppunparse.py index 18ee00721b..e5e5a57f09 100644 --- a/dace/codegen/cppunparse.py +++ b/dace/codegen/cppunparse.py @@ -349,6 +349,8 @@ def _Assign(self, t): # if the veclen is greater than one, this should be defined with a vector data type self.write("{}{} ".format(dace.dtypes._OCL_VECTOR_TYPES[inferred_type.type], inferred_type.veclen)) + elif self.language == dace.dtypes.Language.OpenCL: + self.write(dace.dtypes._OCL_TYPES[inferred_type.type] + " ") else: self.write(dace.dtypes._CTYPES[inferred_type.type] + " ") else: @@ -555,7 +557,11 @@ def _write_constant(self, value): if result.find("b'") >= 0: self.write(result) else: - self.write(result.replace('\'', '\"')) + towrite = result + if result.startswith("'"): + towrite = result[1:-1].replace('"', '\\"') + towrite = f'"{towrite}"' + self.write(towrite) def _Constant(self, t): value = t.value @@ -749,6 +755,8 @@ def _Num(self, t): # For complex values, use ``dtype_to_typeclass`` if isinstance(t_n, complex): dtype = dtypes.dtype_to_typeclass(complex) + repr_n = f'{dtype}({t_n.real}, {t_n.imag})' + # Handle large integer values if isinstance(t_n, int): @@ -765,10 +773,8 @@ def _Num(self, t): elif bits >= 64: warnings.warn(f'Value wider than 64 bits encountered in expression ({t_n}), emitting as-is') - if repr_n.endswith("j"): - self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1])) - else: - self.write(repr_n.replace("inf", INFSTR)) + repr_n = repr_n.replace("inf", INFSTR) + self.write(repr_n) def _List(self, t): raise NotImplementedError('Invalid C++') @@ -1187,6 +1193,8 @@ def py2cpp(code, expr_semicolon=True, defined_symbols=None): return cppunparse(ast.parse(symbolic.symstr(code, cpp_mode=True)), expr_semicolon, defined_symbols=defined_symbols) + elif isinstance(code, int): + return str(code) elif code.__class__.__name__ == 'function': try: code_str = inspect.getsource(code) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index c34c829c31..89239abcb3 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -26,7 +26,7 @@ from dace.frontend import operations from dace.frontend.python import astutils from dace.frontend.python.astutils import ExtNodeTransformer, rname, unparse -from dace.sdfg import nodes, graph as gr, utils +from dace.sdfg import nodes, graph as gr, utils, propagation from dace.properties import LambdaProperty from dace.sdfg import SDFG, is_devicelevel_gpu, SDFGState from dace.codegen.targets import fpga @@ -417,9 +417,10 @@ def reshape_strides(subset, strides, original_strides, copy_shape): dims = len(copy_shape) reduced_tile_sizes = [ts for ts, s in zip(subset.tile_sizes, original_copy_shape) if s != 1] + reduced_tile_sizes += [1] * (dims - len(reduced_tile_sizes)) # Pad the remainder with 1s to maintain dimensions. reshaped_copy = copy_shape + [ts for ts in subset.tile_sizes if ts != 1] - reshaped_copy[:len(copy_shape)] = [s / ts for s, ts in zip(copy_shape, reduced_tile_sizes)] + reshaped_copy[:len(copy_shape)] = [s // ts for s, ts in zip(copy_shape, reduced_tile_sizes)] new_strides = [0] * len(reshaped_copy) elements_remaining = functools.reduce(sp.Mul, copy_shape, 1) @@ -712,6 +713,31 @@ def _check_map_conflicts(map, edge): return True +def _check_neighbor_conflicts(dfg, edge): + """ + Checks for other memlets writing to edges that may overlap in subsets. + + Returns True if there are no conflicts, False if there may be. 
+ """ + outer = propagation.propagate_memlet(dfg, edge.data, edge.dst, False) + siblings = dfg.in_edges(edge.dst) + for sibling in siblings: + if sibling is edge: + continue + if sibling.data.data != edge.data.data: + continue + # Check if there is definitely no overlap in the propagated memlet + sibling_outer = propagation.propagate_memlet(dfg, sibling.data, edge.dst, False) + if subsets.intersects(outer.subset, sibling_outer.subset) == False: + # In that case, continue + continue + + # Other cases are indeterminate and will be atomic + return False + # No overlaps in current scope + return True + + def write_conflicted_map_params(map, edge): result = [] for itervar, (_, _, mapskip) in zip(map.params, map.range): @@ -768,6 +794,8 @@ def is_write_conflicted_with_reason(dfg, edge, datanode=None, sdfg_schedule=None for e in path: if (isinstance(e.dst, nodes.ExitNode) and (e.dst.map.schedule != dtypes.ScheduleType.Sequential and e.dst.map.schedule != dtypes.ScheduleType.Snitch)): + if not _check_neighbor_conflicts(dfg, e): + return e.dst if _check_map_conflicts(e.dst.map, e): # This map is parallel w.r.t. WCR # print('PAR: Continuing from map') @@ -1037,6 +1065,16 @@ def _Name(self, t: ast.Name): desc = self.sdfg.arrays[t.id] self.write(ptr(t.id, desc, self.sdfg, self.codegen)) + def _Attribute(self, t: ast.Attribute): + from dace.frontend.python.astutils import rname + name = rname(t) + if name not in self.sdfg.arrays: + return super()._Attribute(t) + + # Replace values with their code-generated names (for example, persistent arrays) + desc = self.sdfg.arrays[name] + self.write(ptr(name, desc, self.sdfg, self.codegen)) + def _Subscript(self, t: ast.Subscript): from dace.frontend.python.astutils import subscript_to_slice target, rng = subscript_to_slice(t, self.sdfg.arrays) @@ -1142,7 +1180,7 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT return sum(symbolic.pystr_to_symbolic(unparse(elt)) * s for elt, s in zip(elts, strides)) if len(strides) != 1: - raise SyntaxError('Missing dimensions in expression (expected %d, got one)' % len(strides)) + raise SyntaxError('Missing dimensions in expression (expected one, got %d)' % len(strides)) try: return symbolic.pystr_to_symbolic(unparse(visited_slice)) * strides[0] @@ -1279,8 +1317,7 @@ def visit_Name(self, node: ast.Name): if memlet.data in self.sdfg.arrays and self.sdfg.arrays[memlet.data].dtype == dtype: return self.generic_visit(node) return ast.parse(f"{name}[0]").body[0].value - elif (self.allow_casts and (defined_type in (DefinedType.Stream, DefinedType.StreamArray)) - and memlet.dynamic): + elif (self.allow_casts and (defined_type in (DefinedType.Stream, DefinedType.StreamArray)) and memlet.dynamic): return ast.parse(f"{name}.pop()").body[0].value else: return self.generic_visit(node) @@ -1314,8 +1351,8 @@ def visit_BinOp(self, node: ast.BinOp): evaluated_constant = symbolic.evaluate(unparsed, self.constants) evaluated = symbolic.symstr(evaluated_constant, cpp_mode=True) value = ast.parse(evaluated).body[0].value - if isinstance(evaluated_node, numbers.Number) and evaluated_node != ( - value.value if sys.version_info >= (3, 8) else value.n): + if isinstance(evaluated_node, numbers.Number) and evaluated_node != (value.value if sys.version_info >= + (3, 8) else value.n): raise TypeError node.right = ast.parse(evaluated).body[0].value except (TypeError, AttributeError, NameError, KeyError, ValueError, SyntaxError): @@ -1328,6 +1365,10 @@ def visit_Attribute(self, node): attrname = rname(node) module_name = 
attrname[:attrname.rfind(".")] func_name = attrname[attrname.rfind(".") + 1:] + if module_name == 'dace' and isinstance(getattr(dace, func_name, False), dtypes.typeclass): + # A type definition + dtype: dtypes.typeclass = getattr(dace, func_name) + return ast.copy_location(ast.Name(id=dtype.ctype, ctx=ast.Load), node) if module_name in dtypes._ALLOWED_MODULES: cppmodname = dtypes._ALLOWED_MODULES[module_name] return ast.copy_location(ast.Name(id=(cppmodname + func_name), ctx=ast.Load), node) @@ -1368,8 +1409,8 @@ def visit_Call(self, node): # TODO: This should be in the CUDA code generator. Add appropriate conditions to node dispatch predicate -def presynchronize_streams(sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.Node, callsite_stream: CodeIOStream): +def presynchronize_streams(sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + callsite_stream: CodeIOStream): state_dfg: SDFGState = cfg.nodes()[state_id] if hasattr(node, "_cuda_stream") or is_devicelevel_gpu(sdfg, state_dfg, node): return diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 51daaa432b..9ba202757e 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -406,8 +406,12 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer self._dispatcher.declared_arrays.add(f"{name}->{k}", defined_type, ctypedef) - self.allocate_array(sdfg, cfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, - function_stream, declaration_stream, allocation_stream) + if isinstance(v, data.Scalar): + # NOTE: Scalar members are already defined in the struct definition. 
+ self._dispatcher.defined_vars.add(f"{name}->{k}", defined_type, ctypedef) + else: + self.allocate_array(sdfg, cfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, + function_stream, declaration_stream, allocation_stream) return if isinstance(nodedesc, data.View): return self.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, @@ -1841,7 +1845,7 @@ def _generate_MapEntry( # Define all input connectors of this map entry for e in dynamic_map_inputs(state_dfg, node): - if e.data.data != e.dst_conn: + if cpp.ptr(e.data.data, sdfg.arrays[e.data.data], sdfg, self._frame) != e.dst_conn: callsite_stream.write( self.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, state_id, node) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f080f2cc62..1cf8919d74 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -23,8 +23,8 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute from dace.config import Config from dace.frontend import operations -from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, - is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope) +from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, + is_devicelevel_gpu, nodes, scope_contains_scope) from dace.sdfg import utils as sdutil from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, StateSubgraphView @@ -68,6 +68,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): dispatcher = self._dispatcher self.create_grid_barrier = False + self.dynamic_tbmap_type = None self.extra_nsdfg_args = [] CUDACodeGen._in_device_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None @@ -892,8 +893,8 @@ def increment(streams): return max_streams, max_events - def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, - dst_node: nodes.Node, dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, + def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node, + dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: u, uconn, v, vconn, memlet = edge @@ -1163,11 +1164,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St copysize=', '.join(_topy(copy_shape)), is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', accum=accum or '::Copy', - args=', '.join( - [src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction - ) - ), - cfg, state_id, [src_node, dst_node]) + args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + + custom_reduction)), cfg, state_id, [src_node, dst_node]) else: callsite_stream.write( (' {func}<{type}, {bdims}, {copysize}, ' + @@ -1236,8 +1234,12 @@ def _begin_streams(self, sdfg, state): result.add(e.dst._cuda_stream) return result - def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, + def generate_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> 
None: # Two modes: device-level state and if this state has active streams if CUDACodeGen._in_device_code: @@ -1361,8 +1363,7 @@ def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: "&& threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) elif write_scope == 'block': - callsite_stream.write("if (threadIdx.x == 0) " - "{ // sub-graph begin", cfg, state.block_id) + callsite_stream.write("if (threadIdx.x == 0) " "{ // sub-graph begin", cfg, state.block_id) else: callsite_stream.write("{ // subgraph begin", cfg, state.block_id) else: @@ -1985,16 +1986,13 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # allocating shared memory for dynamic threadblock maps if has_dtbmap: - kernel_stream.write( - '__shared__ dace::' - 'DynamicMap<{fine_grained}, {block_size}>' - '::shared_type dace_dyn_map_shared;'.format( - fine_grained=('true' - if Config.get_bool('compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), - block_size=functools.reduce( - (lambda x, y: x * y), - [int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), cfg, - state_id, node) + self.dynamic_tbmap_type = ( + f'dace::DynamicMap<{"true" if Config.get_bool("compiler", "cuda", "dynamic_map_fine_grained") else "false"}, ' + f'{functools.reduce((lambda x, y: x * y), [int(x) for x in Config.get("compiler", "cuda", "dynamic_map_block_size").split(",")])}>' + '::shared_type') + kernel_stream.write(f'__shared__ {self.dynamic_tbmap_type} dace_dyn_map_shared;', cfg, state_id, node) + else: + self.dynamic_tbmap_type = None # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) @@ -2072,8 +2070,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # Generate conditions for this block's execution using min and max # element, e.g., skipping out-of-bounds threads in trailing block - # unless thsi is handled by another map down the line - if (not has_tbmap and not has_dtbmap and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): + # unless this is handled by another map down the line + if ((not has_tbmap or has_dtbmap) and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): dsym_end = [d + bs - 1 for d, bs in zip(dsym, self._block_dims)] minels = krange.min_element() maxels = krange.max_element() @@ -2090,10 +2088,12 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S condition += '%s < %s' % (v, _topy(maxel + 1)) if len(condition) > 0: self._kernel_grid_conditions.append(f'if ({condition}) {{') - kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: self._kernel_grid_conditions.append('{') - kernel_stream.write('{', cfg, state_id, scope_entry) + if not has_dtbmap: + kernel_stream.write('{', cfg, state_id, scope_entry) self._dispatcher.dispatch_subgraph(sdfg, cfg, @@ -2112,6 +2112,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._kernel_state = None CUDACodeGen._in_device_code = False self._grid_dims = None + self.dynamic_tbmap_type = None def get_next_scope_entries(self, dfg, scope_entry): parent_scope_entry = dfg.entry_node(scope_entry) @@ -2179,10 +2180,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco current_sdfg = current_state.parent if not outer_scope: raise ValueError(f'Failed to find the outer scope of {scope_entry}') - 
callsite_stream.write( - 'if ({} < {}) {{'.format(outer_scope.map.params[0], - _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), cfg, - state_id, scope_entry) + for cond in self._kernel_grid_conditions: + callsite_stream.write(cond, cfg, state_id, scope_entry) # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. # They define inside the schedule the bounds of the any nested Maps. @@ -2205,8 +2204,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco '__dace_dynmap_begin = {begin};\n' '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), cfg, state_id, scope_entry) - # close if - callsite_stream.write('}', cfg, state_id, scope_entry) + # Close kernel grid conditions + for _ in self._kernel_grid_conditions: + callsite_stream.write('}', cfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' @@ -2215,7 +2215,7 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco 'auto {param}) {{'.format(fine_grained=('true' if Config.get_bool( 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), bsize=total_block_size, - kmapIdx=outer_scope.map.params[0], + kmapIdx=outer_scope.map.params[-1], param=dynmap_var), cfg, state_id, scope_entry) for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): @@ -2556,8 +2556,8 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco for cond in self._kernel_grid_conditions: callsite_stream.write(cond, cfg, state_id, scope_entry) - def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if self.node_dispatch_predicate(sdfg, dfg, node): # Dynamically obtain node generator according to class name gen = getattr(self, '_generate_' + type(node).__name__, False) @@ -2594,6 +2594,8 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) if self.create_grid_barrier: result.append(('cub::GridBarrier&', '__gbar', '__gbar')) + if self.dynamic_tbmap_type: + result.append((f'{self.dynamic_tbmap_type}&', 'dace_dyn_map_shared', 'dace_dyn_map_shared')) # Add data from nested SDFGs to kernel arguments result.extend([(atype, aname, aname) for atype, aname, _ in self.extra_nsdfg_args]) diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index 0c74d6ec07..61ba9f95ad 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -2112,7 +2112,11 @@ def _generate_MapEntry(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgr end_type = None if end_type is not None: if np.dtype(end_type.dtype.type) > np.dtype('uint32'): - loop_var_type = end_type.ctype + v = dace.config.Config.get("compiler", "fpga", "vendor") + if v.casefold() == 'intel_fpga'.casefold(): + loop_var_type = end_type.ocltype + else: + loop_var_type = end_type.ctype elif np.issubdtype(np.dtype(end_type.dtype.type), np.unsignedinteger): loop_var_type = "size_t" except (UnboundLocalError): diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index d71ea40fee..0b8fa739fe 100644 --- a/dace/codegen/targets/framecode.py +++ 
b/dace/codegen/targets/framecode.py @@ -947,7 +947,12 @@ def generate_code(self, if not is_top_level and isvarName in sdfg.parent_nsdfg_node.symbol_mapping: continue isvar = data.Scalar(isvarType) - callsite_stream.write('%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)), sdfg) + if (schedule in (dtypes.ScheduleType.FPGA_Device, dtypes.ScheduleType.FPGA_Multi_Pumped) + and config.Config.get('compiler', 'fpga', 'vendor').lower() == 'intel_fpga'): + # Emit OpenCL type + callsite_stream.write(f'{isvarType.ocltype} {isvarName};\n', sdfg) + else: + callsite_stream.write('%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)), sdfg) self.dispatcher.defined_vars.add(isvarName, disp.DefinedType.Scalar, isvarType.ctype) callsite_stream.write('\n', sdfg) diff --git a/dace/codegen/targets/intel_fpga.py b/dace/codegen/targets/intel_fpga.py index 513dc0bbfc..9437dccbe3 100644 --- a/dace/codegen/targets/intel_fpga.py +++ b/dace/codegen/targets/intel_fpga.py @@ -169,15 +169,16 @@ def get_generated_codeobjects(self): "cpp", IntelFPGACodeGen, "Intel FPGA", - target_type="host") + target_type="host", + sdfg=self._global_sdfg) kernel_code_objs = [ - CodeObject(kernel_name, code, "cl", IntelFPGACodeGen, "Intel FPGA", target_type="device") + CodeObject(kernel_name, code, "cl", IntelFPGACodeGen, "Intel FPGA", target_type="device", sdfg=self._global_sdfg) for (kernel_name, code, _) in self._kernel_codes ] # add the util header if present other_code_objs = [ - CodeObject(file_name, code.getvalue(), "cl", IntelFPGACodeGen, "Intel FPGA", target_type="device") + CodeObject(file_name, code.getvalue(), "cl", IntelFPGACodeGen, "Intel FPGA", target_type="device", sdfg=self._global_sdfg) for (file_name, code) in self._other_codes.items() ] @@ -299,8 +300,8 @@ def make_kernel_argument(self, data, var_name, is_output, with_vectorization): return "__global volatile {}* restrict {}".format(vec_type, var_name) elif isinstance(data, dace.data.Stream): return None # Streams are global objects - else: - return data.as_arg(with_types=True, name=var_name) + else: # Scalar or structure + return f'{data.dtype.ocltype} {var_name}' @staticmethod def generate_unroll_loop_pre(kernel_stream, factor, sdfg, cfg, state_id, node): @@ -570,8 +571,9 @@ def generate_module(self, sdfg, cfg, state, kernel_name, module_name, subgraph, arg = self.make_kernel_argument(p, pname, is_output, True) if arg is not None: - #change c type long long to opencl type long - arg = arg.replace("long long", "long") + #change c type to opencl type + if arg in dtypes._CTYPES_TO_OCLTYPES: + arg = dtypes._CTYPES_TO_OCLTYPES[arg] kernel_args_opencl.append(arg) kernel_args_host.append(p.as_arg(True, name=pname)) @@ -733,7 +735,7 @@ def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_referen arguments = [f'{atype} {aname}' for atype, aname, _ in memlet_references] fsyms = node.sdfg.used_symbols(all_symbols=False, keep_defined_in_mapping=True) arguments += [ - f'{node.sdfg.symbols[aname].as_arg(aname)}' for aname in sorted(node.symbol_mapping.keys()) + f'{node.sdfg.symbols[aname].ocltype} {aname}' for aname in sorted(node.symbol_mapping.keys()) if aname in fsyms and aname not in sdfg.constants ] arguments = ', '.join(arguments) @@ -769,8 +771,9 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): ptrname = cpp.ptr(in_memlet.data, desc, sdfg, self._frame) defined_type, defined_ctype = self._dispatcher.defined_vars.get(ptrname, 1) - #change c type long long to opencl type long - defined_ctype = defined_ctype.replace("long long", 
"long") + #change c type to opencl type + if defined_ctype in dtypes._CTYPES_TO_OCLTYPES: + defined_ctype = dtypes._CTYPES_TO_OCLTYPES[defined_ctype] if isinstance(desc, dace.data.Array) and (desc.storage == dtypes.StorageType.FPGA_Global or desc.storage == dtypes.StorageType.FPGA_Local): @@ -822,9 +825,9 @@ def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): ptrname = cpp.ptr(out_memlet.data, desc, sdfg, self._frame) defined_type, defined_ctype = self._dispatcher.defined_vars.get(ptrname, 1) - #change c type long long to opencl type long - if defined_ctype.__contains__("long long"): - defined_ctype = defined_ctype.replace("long long", "long") + #change c type to opencl type + if defined_ctype in dtypes._CTYPES_TO_OCLTYPES: + defined_ctype = dtypes._CTYPES_TO_OCLTYPES[defined_ctype] if isinstance(desc, dace.data.Array) and (desc.storage == dtypes.StorageType.FPGA_Global or desc.storage == dtypes.StorageType.FPGA_Local): @@ -908,7 +911,7 @@ def allocate_view(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: SDFGState, # derive the declaration/definition qualifier = "__global volatile " - atype = dtypes.pointer(nodedesc.dtype).ctype + " restrict" + atype = dtypes.pointer(nodedesc.dtype).ocltype + " restrict" aname = ptrname viewed_desc = sdfg.arrays[edge.data.data] eptr = cpp.ptr(edge.data.data, viewed_desc, sdfg, self._frame) @@ -1261,7 +1264,7 @@ def generate_constants(self, sdfg, callsite_stream): for cstname, (csttype, cstval) in sdfg.constants_prop.items(): if isinstance(csttype, dace.data.Array): - const_str = "__constant " + csttype.dtype.ctype + \ + const_str = "__constant " + csttype.dtype.ocltype + \ " " + cstname + "[" + str(cstval.size) + "]" if cstname not in self.generated_constants: diff --git a/dace/codegen/tools/type_inference.py b/dace/codegen/tools/type_inference.py index 893866522f..26b369fa9d 100644 --- a/dace/codegen/tools/type_inference.py +++ b/dace/codegen/tools/type_inference.py @@ -9,7 +9,7 @@ import numpy as np import ast -from dace import dtypes +from dace import data, dtypes from dace import symbolic from dace.codegen import cppunparse from dace.symbolic import symbol, SymExpr, symstr @@ -286,6 +286,8 @@ def _Name(t, symbols, inferred_symbols): inferred_type = dtypes.typeclass(inferred_type.type) elif isinstance(inferred_type, symbolic.symbol): inferred_type = inferred_type.dtype + elif isinstance(inferred_type, data.Data): + inferred_type = inferred_type.dtype elif t_id in inferred_symbols: inferred_type = inferred_symbols[t_id] return inferred_type @@ -373,6 +375,8 @@ def _Compare(t, symbols, inferred_symbols): for o, e in zip(t.ops, t.comparators): if o.__class__.__name__ not in cppunparse.CPPUnparser.cmpops: continue + if isinstance(e, ast.Constant) and e.value is None: + continue inf_type = _dispatch(e, symbols, inferred_symbols) if isinstance(inf_type, dtypes.vector): # Make sure all occuring vectors are of same size diff --git a/dace/config_schema.yml b/dace/config_schema.yml index da35e61997..b5a7914018 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -173,6 +173,20 @@ required: The typename of this struct is derived by appending this value to the SDFG's name. Note that the suffix may only contains letters, digits and underscores. + format_code: + type: bool + default: false + title: Format code with `clang-format` + description: > + Formats the generated code with `clang-format` before saving the files. 
+ + format_config_file: type: str default: "" title: Path to the clang-format file description: > Clang-format configuration file to be used by clang-format; only used if format_code is true + default_data_types: type : str default: Python @@ -474,7 +488,7 @@ required: title: Detect parts of an SDFG that can run in parallel description: > If set to false, DaCe will place each weakly connected - component found in an SDFG state in a different Kernel/Processing Element. + component found in an SDFG state in a different Kernel/Processing Element. If true, a heuristic will further inspect each independent component for other parallelism opportunities (e.g., branches of the SDFG that can be executed in parallel), creating the corresponding kernels. @@ -800,7 +814,7 @@ required: default: false title: Check arguments on SDFG call description: > - Perform an early type check on arguments passed to an SDFG when called directly (from + Perform an early type check on arguments passed to an SDFG when called directly (from ``SDFG.__call__``). Another type check is performed when calling compiled SDFGs. avoid_wcr: @@ -835,18 +849,18 @@ required: title: Compiled cache entry naming policy description: > Determine the name of the generated ``.dacecache`` folder: - - + + * ``name`` uses the name of the SDFG directly, causing it to be overridden by other programs using the same SDFG name. - + * ``hash`` uses a mangled name based on the hash of the SDFG, such that any change to the SDFG will generate a different cache folder. - + * ``unique`` uses a name based on the currently running Python process at code generation time, such that no caching or clashes can happen between different processes or subsequent invocations of Python. - + * ``single`` uses a single cache folder for all SDFGs, saving space and potentially build time, but disallows executing SDFGs in parallel and caching of more than one simultaneous SDFG. @@ -919,6 +933,13 @@ required: description: > Check for undefined symbols in memlets during SDFG validation. + check_race_conditions: + type: bool + default: false + title: Check race conditions + description: > + Check for potential race conditions during validation. + ############################################# # Features for unit testing diff --git a/dace/data.py b/dace/data.py index a07fe42083..9749411fe6 100644 --- a/dace/data.py +++ b/dace/data.py @@ -167,9 +167,16 @@ class Data: Examples: Arrays, Streams, custom arrays (e.g., sparse matrices). 
""" + def _transient_setter(self, value): + self._transient = value + if isinstance(self, Structure): + for _, v in self.members.items(): + if isinstance(v, Data): + v.transient = value + dtype = TypeClassProperty(default=dtypes.int32, choices=dtypes.Typeclasses) shape = ShapeProperty(default=[]) - transient = Property(dtype=bool, default=False) + transient = Property(dtype=bool, default=False, setter=_transient_setter) storage = EnumProperty(dtype=dtypes.StorageType, desc="Storage location", default=dtypes.StorageType.Default) lifetime = EnumProperty(dtype=dtypes.AllocationLifetime, desc='Data allocation span', diff --git a/dace/distr_types.py b/dace/distr_types.py index 1b595a1b84..b60eb4925e 100644 --- a/dace/distr_types.py +++ b/dace/distr_types.py @@ -96,6 +96,10 @@ def _validate(self): raise ValueError('Color must have only logical true (1) or false (0) values.') return True + @property + def dtype(self): + return type(self) + def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} diff --git a/dace/dtypes.py b/dace/dtypes.py index c5f9bb4732..465e73b2b1 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1,10 +1,8 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ A module that contains various DaCe type definitions. """ -from __future__ import print_function import ctypes import aenum import inspect -import itertools import numpy import re from collections import OrderedDict @@ -252,12 +250,12 @@ class TilingType(aenum.AutoNumberEnum): numpy.int16: "short", numpy.int32: "int", numpy.intc: "int", - numpy.int64: "long long", - numpy.uint8: "unsigned char", - numpy.uint16: "unsigned short", - numpy.uint32: "unsigned int", - numpy.uintc: "unsigned int", - numpy.uint64: "unsigned long long", + numpy.int64: "int64_t", + numpy.uint8: "uint8_t", + numpy.uint16: "uint16_t", + numpy.uint32: "uint32_t", + numpy.uintc: "dace::uint", + numpy.uint64: "uint64_t", numpy.float16: "dace::float16", numpy.float32: "float", numpy.float64: "double", @@ -277,17 +275,37 @@ class TilingType(aenum.AutoNumberEnum): numpy.int32: "int", numpy.intc: "int", numpy.int64: "long", - numpy.uint8: "unsigned char", - numpy.uint16: "unsigned short", - numpy.uint32: "unsigned int", - numpy.uint64: "unsigned long", - numpy.uintc: "unsigned int", + numpy.uint8: "uchar", + numpy.uint16: "ushort", + numpy.uint32: "uint", + numpy.uint64: "ulong", + numpy.uintc: "uint", numpy.float32: "float", numpy.float64: "double", numpy.complex64: "complex float", numpy.complex128: "complex double", } +_CTYPES_TO_OCLTYPES = { + "void": "void", + "int": "int", + "float": "float", + "double": "double", + "dace::complex64": "complex float", + "dace::complex128": "complex double", + "bool": "bool", + "char": "char", + "short": "short", + "int": "int", + "int64_t": "long", + "uint8_t": "uchar", + "uint16_t": "ushort", + "uint32_t": "uint", + "dace::uint": "uint", + "uint64_t": "ulong", + "dace::float16": "half", +} + # Translation of types to OpenCL vector types _OCL_VECTOR_TYPES = { numpy.int8: "char", @@ -406,6 +424,8 @@ def __init__(self, wrapped_type, typename=None): wrapped_type = numpy.bool_ elif getattr(wrapped_type, '__name__', '') == 'bool_' and typename is None: typename = 'bool' + elif wrapped_type is type(None): + wrapped_type = None self.type = wrapped_type # Type in Python self.ctype = _CTYPES[wrapped_type] # Type in C @@ -1295,7 +1315,7 @@ def dtype_to_typeclass(dtype=None): bool = bool_ TYPECLASS_TO_STRING = { - bool: 
"dace::bool", + bool: "dace::bool_", bool_: "dace::bool_", uint8: "dace::uint8", uint16: "dace::uint16", diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 88a6b0c54a..c517028d53 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -50,14 +50,14 @@ def _cart_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, dims: Shape @oprepo.replaces_method('Intracomm', 'Create_cart') -def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', dims: ShapeType): +def _intracomm_create(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, dims: ShapeType): """ Equivalent to `dace.comm.Cart_create(dims). :param dims: Shape of the process-grid (see `dims` parameter of `MPI_Cart_create`), e.g., [2, 3, 3]. :return: Name of the new process-grid descriptor. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _cart_create(pv, sdfg, state, dims) @@ -186,13 +186,13 @@ def _bcast(pv: ProgramVisitor, def _intracomm_bcast(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, - comm: Tuple[str, 'Comm'], + comm: str, buffer: str, root: Union[str, sp.Expr, Number] = 0): """ Equivalent to `dace.comm.Bcast(buffer, root)`. """ from mpi4py import MPI - comm_name, comm_obj = comm + comm_name, comm_obj = comm, pv.globals[comm] if comm_obj == MPI.COMM_WORLD: return _bcast(pv, sdfg, state, buffer, root) # NOTE: Highly experimental @@ -267,12 +267,12 @@ def _alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, inbuffer: str, @oprepo.replaces_method('Intracomm', 'Alltoall') -def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: str, +def _intracomm_alltoall(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, inp_buffer: str, out_buffer: str): """ Equivalent to `dace.comm.Alltoall(inp_buffer, out_buffer)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _alltoall(pv, sdfg, state, inp_buffer, out_buffer) @@ -303,12 +303,12 @@ def _allreduce(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, buffer: str, op @oprepo.replaces_method('Intracomm', 'Allreduce') -def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', inp_buffer: 'InPlace', +def _intracomm_allreduce(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, inp_buffer: 'InPlace', out_buffer: str, op: str): """ Equivalent to `dace.comm.Allreduce(out_buffer, op)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') if inp_buffer != MPI.IN_PLACE: @@ -470,12 +470,12 @@ def _send(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Send') -def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_send(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.end(buffer, dst, tag)`. 
""" from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _send(pv, sdfg, state, buffer, dst, tag) @@ -592,12 +592,12 @@ def _isend(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Isend') -def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_isend(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, dst: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Isend(buffer, dst, tag, req)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("isend_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) @@ -690,12 +690,12 @@ def _recv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Recv') -def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_Recv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Recv(buffer, src, tagq)`. """ from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') return _recv(pv, sdfg, state, buffer, src, tag) @@ -810,12 +810,12 @@ def _irecv(pv: ProgramVisitor, @oprepo.replaces_method('Intracomm', 'Irecv') -def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: 'Intracomm', buffer: str, +def _intracomm_irecv(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, icomm: str, buffer: str, src: Union[str, sp.Expr, Number], tag: Union[str, sp.Expr, Number]): """ Equivalent to `dace.comm.Irecv(buffer, src, tag, req)`. 
""" from mpi4py import MPI - icomm_name, icomm_obj = icomm + icomm_name, icomm_obj = icomm, pv.globals[icomm] if icomm_obj != MPI.COMM_WORLD: raise ValueError('Only the mpi4py.MPI.COMM_WORLD Intracomm is supported in DaCe Python programs.') req, _ = sdfg.add_array("irecv_req", [1], dace.dtypes.opaque("MPI_Request"), transient=True, find_new_name=True) diff --git a/dace/frontend/fortran/ast_components.py b/dace/frontend/fortran/ast_components.py index 332c3a563f..ab0aa9c777 100644 --- a/dace/frontend/fortran/ast_components.py +++ b/dace/frontend/fortran/ast_components.py @@ -987,9 +987,6 @@ def block_nonlabel_do_construct(self, node: FASTNode): body=ast_internal_classes.Execution_Part_Node(execution=body), line_number=do.line_number) - def real_literal_constant(self, node: FASTNode): - return node - def subscript_triplet(self, node: FASTNode): if node.string == ":": return ast_internal_classes.ParDecl_Node(type="ALL") diff --git a/dace/frontend/fortran/ast_transforms.py b/dace/frontend/fortran/ast_transforms.py index 0c96560fba..57508d6d90 100644 --- a/dace/frontend/fortran/ast_transforms.py +++ b/dace/frontend/fortran/ast_transforms.py @@ -184,7 +184,7 @@ def __init__(self, funcs=None): from dace.frontend.fortran.intrinsics import FortranIntrinsics self.excepted_funcs = [ - "malloc", "exp", "pow", "sqrt", "cbrt", "max", "abs", "min", "__dace_sign", "tanh", + "malloc", "pow", "cbrt", "__dace_sign", "tanh", "atan2", "__dace_epsilon", *FortranIntrinsics.function_names() ] @@ -220,7 +220,7 @@ def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): from dace.frontend.fortran.intrinsics import FortranIntrinsics if not stop and node.name.name not in [ - "malloc", "exp", "pow", "sqrt", "cbrt", "max", "min", "abs", "tanh", "__dace_epsilon", *FortranIntrinsics.call_extraction_exemptions() + "malloc", "pow", "cbrt", "__dace_epsilon", *FortranIntrinsics.call_extraction_exemptions() ]: self.nodes.append(node) return self.generic_visit(node) @@ -241,7 +241,7 @@ def __init__(self, count=0): def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): from dace.frontend.fortran.intrinsics import FortranIntrinsics - if node.name.name in ["malloc", "exp", "pow", "sqrt", "cbrt", "max", "min", "abs", "tanh", "__dace_epsilon", *FortranIntrinsics.call_extraction_exemptions()]: + if node.name.name in ["malloc", "pow", "cbrt", "__dace_epsilon", *FortranIntrinsics.call_extraction_exemptions()]: return self.generic_visit(node) if hasattr(node, "subroutine"): if node.subroutine is True: @@ -251,6 +251,11 @@ def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): else: self.count = self.count + 1 tmp = self.count + + for i, arg in enumerate(node.args): + # Ensure we allow to extract function calls from arguments + node.args[i] = self.visit(arg) + return ast_internal_classes.Name_Node(name="tmp_call_" + str(tmp - 1)) def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): @@ -263,9 +268,13 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No for i in res: if i == child: res.pop(res.index(i)) - temp = self.count if res is not None: - for i in range(0, len(res)): + # Variables are counted from 0...end, starting from main node, to all calls nested + # in main node arguments. + # However, we need to define nested ones first. + # We go in reverse order, counting from end-1 to 0. 
+ temp = self.count + len(res) - 1 + for i in reversed(range(0, len(res))): newbody.append( ast_internal_classes.Decl_Stmt_Node(vardecl=[ @@ -282,7 +291,7 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No type=res[i].type), rval=res[i], line_number=child.line_number)) - temp = temp + 1 + temp = temp - 1 if isinstance(child, ast_internal_classes.Call_Expr_Node): new_args = [] if hasattr(child, "args"): @@ -368,7 +377,8 @@ def __init__(self): self.nodes: List[ast_internal_classes.Array_Subscript_Node] = [] def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): - if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: + from dace.frontend.fortran.intrinsics import FortranIntrinsics + if node.name.name in ["pow", "atan2", "tanh", *FortranIntrinsics.retained_function_names()]: return self.generic_visit(node) else: return @@ -401,7 +411,8 @@ def __init__(self, ast: ast_internal_classes.FNode, normalize_offsets: bool = Fa self.scope_vars.visit(ast) def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): - if node.name.name in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh"]: + from dace.frontend.fortran.intrinsics import FortranIntrinsics + if node.name.name in ["pow", "atan2", "tanh", *FortranIntrinsics.retained_function_names()]: return self.generic_visit(node) else: return node diff --git a/dace/frontend/fortran/ast_utils.py b/dace/frontend/fortran/ast_utils.py index 41cbeff1f9..b52bd31df7 100644 --- a/dace/frontend/fortran/ast_utils.py +++ b/dace/frontend/fortran/ast_utils.py @@ -188,8 +188,31 @@ def intlit2string(self, node: ast_internal_classes.Int_Literal_Node): return "".join(map(str, node.value)) def floatlit2string(self, node: ast_internal_classes.Real_Literal_Node): - - return "".join(map(str, node.value)) + # Typecheck and crash early if unexpected. + assert hasattr(node, 'value') + lit = node.value + assert isinstance(lit, str) + + # Fortran "real literals" may have an additional suffix at the end. + # Examples: + # valid: 1.0 => 1 + # valid: 1. => 1 + # valid: 1.e5 => 1e5 + # valid: 1.d5 => 1e5 + # valid: 1._kinder => 1 (precondition: somewhere earlier, `integer, parameter :: kinder=8`) + # valid: 1.e5_kinder => 1e5 + # not valid: 1.d5_kinder => 1e5 + # TODO: Is there a complete spec of the structure of real literals? + if '_' in lit: + # First, deal with kind specification and remove it altogether, since we know the type anyway. + parts = lit.split('_') + assert 1 <= len(parts) <= 2, f"{lit} is not a valid fortran literal." + lit = parts[0] + assert 'd' not in lit, f"{lit} is not a valid fortran literal." + if 'd' in lit: + # Again, since we know the type anyway, here we just make the s/d/e/ replacement. 
+ lit = lit.replace('d', 'e') + return f"{float(lit)}" def boollit2string(self, node: ast_internal_classes.Bool_Literal_Node): diff --git a/dace/frontend/fortran/fortran_parser.py b/dace/frontend/fortran/fortran_parser.py index 52344c141f..1cdecc99a8 100644 --- a/dace/frontend/fortran/fortran_parser.py +++ b/dace/frontend/fortran/fortran_parser.py @@ -818,7 +818,8 @@ def binop2sdfg(self, node: ast_internal_classes.BinOp_Node, sdfg: SDFG, cfg: Con calls.visit(node) if len(calls.nodes) == 1: augmented_call = calls.nodes[0] - if augmented_call.name.name not in ["sqrt", "exp", "pow", "max", "min", "abs", "tanh", "__dace_epsilon"]: + from dace.frontend.fortran.intrinsics import FortranIntrinsics + if augmented_call.name.name not in ["pow", "atan2", "tanh", "__dace_epsilon", *FortranIntrinsics.retained_function_names()]: augmented_call.args.append(node.lval) augmented_call.hasret = True self.call2sdfg(augmented_call, sdfg, cfg) @@ -1090,7 +1091,8 @@ def create_ast_from_string( program = ast_transforms.ArrayToLoop(program).visit(program) for transformation in own_ast.fortran_intrinsics().transformations(): - program = transformation(program).visit(program) + transformation.initialize(program) + program = transformation.visit(program) program = ast_transforms.ForDeclarer().visit(program) program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) @@ -1126,7 +1128,8 @@ def create_sdfg_from_string( program = ast_transforms.ArrayToLoop(program).visit(program) for transformation in own_ast.fortran_intrinsics().transformations(): - program = transformation(program).visit(program) + transformation.initialize(program) + program = transformation.visit(program) program = ast_transforms.ForDeclarer().visit(program) program = ast_transforms.IndexExtractor(program, normalize_offsets).visit(program) @@ -1172,7 +1175,8 @@ def create_sdfg_from_fortran_file(source_string: str, use_experimental_cfg_block program = ast_transforms.ArrayToLoop(program).visit(program) for transformation in own_ast.fortran_intrinsics(): - program = transformation(program).visit(program) + transformation.initialize(program) + program = transformation.visit(program) program = ast_transforms.ForDeclarer().visit(program) program = ast_transforms.IndexExtractor(program).visit(program) diff --git a/dace/frontend/fortran/intrinsics.py b/dace/frontend/fortran/intrinsics.py index c2e5afe79b..af44a8dfb5 100644 --- a/dace/frontend/fortran/intrinsics.py +++ b/dace/frontend/fortran/intrinsics.py @@ -2,6 +2,7 @@ from abc import abstractmethod import copy import math +from collections import namedtuple from typing import Any, List, Optional, Set, Tuple, Type from dace.frontend.fortran import ast_internal_classes @@ -26,34 +27,175 @@ def replace(func_name: ast_internal_classes.Name_Node, args: ast_internal_classe def has_transformation() -> bool: return False -class SelectedKind(IntrinsicTransformation): +class IntrinsicNodeTransformer(NodeTransformer): + + def initialize(self, ast): + # We need to rerun the assignment because transformations could have created + # new AST nodes + ParentScopeAssigner().visit(ast) + self.scope_vars = ScopeVarsDeclarations() + self.scope_vars.visit(ast) + + @staticmethod + @abstractmethod + def func_name(self) -> str: + pass + +class DirectReplacement(IntrinsicTransformation): + + Replacement = namedtuple("Replacement", "function") + Transformation = namedtuple("Transformation", "function") + + class ASTTransformation(IntrinsicNodeTransformer): + + def visit_BinOp_Node(self, binop_node: 
ast_internal_classes.BinOp_Node): + + if not isinstance(binop_node.rval, ast_internal_classes.Call_Expr_Node): + return binop_node + + node = binop_node.rval + + name = node.name.name.split('__dace_') + if len(name) != 2 or name[1] not in DirectReplacement.FUNCTIONS: + return binop_node + func_name = name[1] + + replacement_rule = DirectReplacement.FUNCTIONS[func_name] + if isinstance(replacement_rule, DirectReplacement.Transformation): + + # FIXME: we do not have line number in binop? + binop_node.rval, input_type = replacement_rule.function(node, self.scope_vars, 0) #binop_node.line) + print(binop_node, binop_node.lval, binop_node.rval) + + # replace types of return variable - LHS of the binary operator + var = binop_node.lval + if isinstance(var.name, ast_internal_classes.Name_Node): + name = var.name.name + else: + name = var.name + var_decl = self.scope_vars.get_var(var.parent, name) + var.type = input_type + var_decl.type = input_type + + return binop_node + + + #self.scope_vars.get_var(node.parent, arg.name). + + def replace_size(var: ast_internal_classes.Call_Expr_Node, scope_vars: ScopeVarsDeclarations, line): + + if len(var.args) not in [1, 2]: + raise RuntimeError() + + # get variable declaration for the first argument + var_decl = scope_vars.get_var(var.parent, var.args[0].name) + + # one arg to SIZE? compute the total number of elements + if len(var.args) == 1: + + if len(var_decl.sizes) == 1: + return (var_decl.sizes[0], "INTEGER") + + ret = ast_internal_classes.BinOp_Node( + lval=var_decl.sizes[0], + rval=None, + op="*" + ) + cur_node = ret + for i in range(1, len(var_decl.sizes) - 1): + + cur_node.rval = ast_internal_classes.BinOp_Node( + lval=var_decl.sizes[i], + rval=None, + op="*" + ) + cur_node = cur_node.rval + + cur_node.rval = var_decl.sizes[-1] + return (ret, "INTEGER") + + # two arguments? 
We return number of elements in a given rank + rank = var.args[1] + # we do not support symbolic argument to DIM - it must be a literal + if not isinstance(rank, ast_internal_classes.Int_Literal_Node): + raise NotImplementedError() + value = int(rank.value) + return (var_decl.sizes[value-1], "INTEGER") + + + def replace_bit_size(var: ast_internal_classes.Call_Expr_Node, scope_vars: ScopeVarsDeclarations, line): + + if len(var.args) != 1: + raise RuntimeError() + + # get variable declaration for the first argument + var_decl = scope_vars.get_var(var.parent, var.args[0].name) + + dace_type = fortrantypes2dacetypes[var_decl.type] + type_size = dace_type().itemsize * 8 + + return (ast_internal_classes.Int_Literal_Node(value=str(type_size)), "INTEGER") + + + def replace_int_kind(args: ast_internal_classes.Arg_List_Node, line): + return ast_internal_classes.Int_Literal_Node(value=str( + math.ceil((math.log2(math.pow(10, int(args.args[0].value))) + 1) / 8)), + line_number=line) + + def replace_real_kind(args: ast_internal_classes.Arg_List_Node, line): + if int(args.args[0].value) >= 9 or int(args.args[1].value) > 126: + return ast_internal_classes.Int_Literal_Node(value="8", line_number=line) + elif int(args.args[0].value) >= 3 or int(args.args[1].value) > 14: + return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) + else: + return ast_internal_classes.Int_Literal_Node(value="2", line_number=line) + FUNCTIONS = { - "SELECTED_INT_KIND": "__dace_selected_int_kind", - "SELECTED_REAL_KIND": "__dace_selected_real_kind", + "SELECTED_INT_KIND": Replacement(replace_int_kind), + "SELECTED_REAL_KIND": Replacement(replace_real_kind), + "BIT_SIZE": Transformation(replace_bit_size), + "SIZE": Transformation(replace_size) } @staticmethod - def replaced_name(func_name: str) -> str: - return SelectedKind.FUNCTIONS[func_name] + def temporary_functions(): + + # temporary functions created by us -> f becomes __dace_f + # We provide this to tell Fortran parser that these are function calls, + # not array accesses + funcs = list(DirectReplacement.FUNCTIONS.keys()) + return [f'__dace_{f}' for f in funcs] + + @staticmethod + def replacable_name(func_name: str) -> bool: + return func_name in DirectReplacement.FUNCTIONS + + @staticmethod + def replace_name(func_name: str) -> str: + #return ast_internal_classes.Name_Node(name=DirectReplacement.FUNCTIONS[func_name][0]) + return ast_internal_classes.Name_Node(name=f'__dace_{func_name}') + + @staticmethod + def replacable(func_name: str) -> bool: + orig_name = func_name.split('__dace_') + if len(orig_name) > 1 and orig_name[1] in DirectReplacement.FUNCTIONS: + return isinstance(DirectReplacement.FUNCTIONS[orig_name[1]], DirectReplacement.Replacement) + return False @staticmethod def replace(func_name: ast_internal_classes.Name_Node, args: ast_internal_classes.Arg_List_Node, line) -> ast_internal_classes.FNode: - if func_name.name == "__dace_selected_int_kind": - return ast_internal_classes.Int_Literal_Node(value=str( - math.ceil((math.log2(math.pow(10, int(args.args[0].value))) + 1) / 8)), - line_number=line) - # This selects the smallest kind that can hold the given number of digits (fp64,fp32 or fp16) - elif func_name.name == "__dace_selected_real_kind": - if int(args.args[0].value) >= 9 or int(args.args[1].value) > 126: - return ast_internal_classes.Int_Literal_Node(value="8", line_number=line) - elif int(args.args[0].value) >= 3 or int(args.args[1].value) > 14: - return ast_internal_classes.Int_Literal_Node(value="4", line_number=line) - else: - 
return ast_internal_classes.Int_Literal_Node(value="2", line_number=line) + # Here we already have __dace_func + fname = func_name.split('__dace_')[1] + return DirectReplacement.FUNCTIONS[fname].function(args, line) - raise NotImplemented() + def has_transformation(fname: str) -> bool: + return isinstance(DirectReplacement.FUNCTIONS[fname], DirectReplacement.Transformation) + + @staticmethod + def get_transformation() -> IntrinsicNodeTransformer: + return DirectReplacement.ASTTransformation() class LoopBasedReplacement: @@ -84,36 +226,34 @@ class LoopBasedReplacementVisitor(NodeVisitor): def __init__(self, func_name: str): self._func_name = func_name self.nodes: List[ast_internal_classes.FNode] = [] + self.calls: List[ast_internal_classes.FNode] = [] def visit_BinOp_Node(self, node: ast_internal_classes.BinOp_Node): - if isinstance(node.rval, ast_internal_classes.Call_Expr_Node): if node.rval.name.name == self._func_name: self.nodes.append(node) + self.calls.append(node.rval) + self.visit(node.lval) + self.visit(node.rval) + + def visit_Call_Expr_Node(self, node: ast_internal_classes.Call_Expr_Node): + + if node.name.name == self._func_name: + if node not in self.calls: + self.nodes.append(node) def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_Node): return -class LoopBasedReplacementTransformation(NodeTransformer): +class LoopBasedReplacementTransformation(IntrinsicNodeTransformer): """ Transforms the AST by removing intrinsic call and replacing it with loops """ - def __init__(self, ast): + def __init__(self): self.count = 0 - - # We need to rerun the assignment because transformations could have created - # new AST nodes - ParentScopeAssigner().visit(ast) - self.scope_vars = ScopeVarsDeclarations() - self.scope_vars.visit(ast) self.rvals = [] - @staticmethod - @abstractmethod - def func_name() -> str: - pass - @abstractmethod def _initialize(self): pass @@ -338,9 +478,6 @@ def visit_Execution_Part_Node(self, node: ast_internal_classes.Execution_Part_No class SumProduct(LoopBasedReplacementTransformation): - def __init__(self, ast): - super().__init__(ast) - def _initialize(self): self.rvals = [] self.argument_variable = None @@ -414,9 +551,6 @@ class Sum(LoopBasedReplacement): class Transformation(SumProduct): - def __init__(self, ast): - super().__init__(ast) - @staticmethod def func_name() -> str: return "__dace_sum" @@ -440,9 +574,6 @@ class Product(LoopBasedReplacement): class Transformation(SumProduct): - def __init__(self, ast): - super().__init__(ast) - @staticmethod def func_name() -> str: return "__dace_product" @@ -455,9 +586,6 @@ def _result_update_op(self): class AnyAllCountTransformation(LoopBasedReplacementTransformation): - def __init__(self, ast): - super().__init__(ast) - def _initialize(self): self.rvals = [] @@ -575,9 +703,6 @@ class Any(LoopBasedReplacement): """ class Transformation(AnyAllCountTransformation): - def __init__(self, ast): - super().__init__(ast) - def _result_init_value(self): return "0" @@ -607,9 +732,6 @@ class All(LoopBasedReplacement): """ class Transformation(AnyAllCountTransformation): - def __init__(self, ast): - super().__init__(ast) - def _result_init_value(self): return "1" @@ -644,9 +766,6 @@ class Count(LoopBasedReplacement): """ class Transformation(AnyAllCountTransformation): - def __init__(self, ast): - super().__init__(ast) - def _result_init_value(self): return "0" @@ -675,9 +794,6 @@ def func_name() -> str: class MinMaxValTransformation(LoopBasedReplacementTransformation): - def __init__(self, ast): - 
super().__init__(ast) - def _initialize(self): self.rvals = [] self.argument_variable = None @@ -753,9 +869,6 @@ class MinVal(LoopBasedReplacement): """ class Transformation(MinMaxValTransformation): - def __init__(self, ast): - super().__init__(ast) - def _result_init_value(self, array: ast_internal_classes.Array_Subscript_Node): var_decl = self.scope_vars.get_var(array.parent, array.name.name) @@ -788,9 +901,6 @@ class MaxVal(LoopBasedReplacement): """ class Transformation(MinMaxValTransformation): - def __init__(self, ast): - super().__init__(ast) - def _result_init_value(self, array: ast_internal_classes.Array_Subscript_Node): var_decl = self.scope_vars.get_var(array.parent, array.name.name) @@ -817,9 +927,6 @@ class Merge(LoopBasedReplacement): class Transformation(LoopBasedReplacementTransformation): - def __init__(self, ast): - super().__init__(ast) - def _initialize(self): self.rvals = [] @@ -939,11 +1046,235 @@ def _generate_loop_body(self, node: ast_internal_classes.FNode) -> ast_internal_ line_number=node.line_number ) +class MathFunctions(IntrinsicTransformation): + + MathTransformation = namedtuple("MathTransformation", "function return_type") + MathReplacement = namedtuple("MathReplacement", "function replacement_function return_type") + + def generate_scale(arg: ast_internal_classes.Call_Expr_Node): + + # SCALE(X, I) becomes: X * pow(RADIX(X), I) + # In our case, RADIX(X) is always 2 + line = arg.line_number + x = arg.args[0] + i = arg.args[1] + const_two = ast_internal_classes.Int_Literal_Node(value="2") + + # I and RADIX(X) are both integers + rval = ast_internal_classes.Call_Expr_Node( + name=ast_internal_classes.Name_Node(name="pow"), + type="INTEGER", + args=[const_two, i], + line_number=line + ) + + mult = ast_internal_classes.BinOp_Node( + op="*", + lval=x, + rval=rval, + line_number=line + ) + + # pack it into parentheses, just to be sure + return ast_internal_classes.Parenthesis_Expr_Node(expr=mult) + + def generate_aint(arg: ast_internal_classes.Call_Expr_Node): + + # The call to AINT can contain a second KIND parameter + # We ignore it a the moment. + # However, to map into C's trunc, we need to drop it. + if len(arg.args) > 1: + del arg.args[1] + + fname = arg.name.name.split('__dace_')[1] + if fname in "AINT": + arg.name = ast_internal_classes.Name_Node(name="trunc") + elif fname == "NINT": + arg.name = ast_internal_classes.Name_Node(name="iround") + elif fname == "ANINT": + arg.name = ast_internal_classes.Name_Node(name="round") + else: + raise NotImplementedError() + + return arg + + INTRINSIC_TO_DACE = { + "MIN": MathTransformation("min", "FIRST_ARG"), + "MAX": MathTransformation("max", "FIRST_ARG"), + "SQRT": MathTransformation("sqrt", "FIRST_ARG"), + "ABS": MathTransformation("abs", "FIRST_ARG"), + "EXP": MathTransformation("exp", "FIRST_ARG"), + # Documentation states that the return type of LOG is always REAL, + # but the kind is the same as of the first argument. + # However, we already replaced kind with types used in DaCe. + # Thus, a REAL that is really DOUBLE will be double in the first argument. 
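+        # e.g., LOG of an argument already typed DOUBLE maps to the C log() call and its return type stays DOUBLE.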
+ "LOG": MathTransformation("log", "FIRST_ARG"), + "MOD": { + "INTEGER": MathTransformation("Mod", "INTEGER"), + "REAL": MathTransformation("Mod_float", "REAL"), + "DOUBLE": MathTransformation("Mod_float", "DOUBLE") + }, + "MODULO": { + "INTEGER": MathTransformation("Modulo", "INTEGER"), + "REAL": MathTransformation("Modulo_float", "REAL"), + "DOUBLE": MathTransformation("Modulo_float", "DOUBLE") + }, + "FLOOR": { + "REAL": MathTransformation("floor", "INTEGER"), + "DOUBLE": MathTransformation("floor", "INTEGER") + }, + "SCALE": MathReplacement(None, generate_scale, "FIRST_ARG"), + "EXPONENT": MathTransformation("frexp", "INTEGER"), + "INT": MathTransformation("int", "INTEGER"), + "AINT": MathReplacement("trunc", generate_aint, "FIRST_ARG"), + "NINT": MathReplacement("iround", generate_aint, "INTEGER"), + "ANINT": MathReplacement("round", generate_aint, "FIRST_ARG"), + "REAL": MathTransformation("float", "REAL"), + "DBLE": MathTransformation("double", "DOUBLE"), + "SIN": MathTransformation("sin", "FIRST_ARG"), + "COS": MathTransformation("cos", "FIRST_ARG"), + "SINH": MathTransformation("sinh", "FIRST_ARG"), + "COSH": MathTransformation("cosh", "FIRST_ARG"), + "TANH": MathTransformation("tanh", "FIRST_ARG"), + "ASIN": MathTransformation("asin", "FIRST_ARG"), + "ACOS": MathTransformation("acos", "FIRST_ARG"), + "ATAN": MathTransformation("atan", "FIRST_ARG"), + "ATAN2": MathTransformation("atan2", "FIRST_ARG") + } + + class TypeTransformer(IntrinsicNodeTransformer): + + def func_type(self, node: ast_internal_classes.Call_Expr_Node): + + # take the first arg + arg = node.args[0] + if isinstance(arg, ast_internal_classes.Real_Literal_Node): + return 'REAL' + elif isinstance(arg, ast_internal_classes.Int_Literal_Node): + return 'INTEGER' + elif isinstance(arg, ast_internal_classes.Call_Expr_Node): + return arg.type + elif isinstance(arg, ast_internal_classes.Name_Node): + input_type = self.scope_vars.get_var(node.parent, arg.name) + return input_type.type + else: + input_type = self.scope_vars.get_var(node.parent, arg.name.name) + return input_type.type + + def replace_call(self, old_call: ast_internal_classes.Call_Expr_Node, new_call: ast_internal_classes.FNode): + + parent = old_call.parent + + # We won't need it if the CallExtractor will properly support nested function calls. + # Then, all function calls should be a binary op: val = func() + if isinstance(parent, ast_internal_classes.BinOp_Node): + if parent.lval == old_call: + parent.lval = new_call + else: + parent.rval = new_call + elif isinstance(parent, ast_internal_classes.UnOp_Node): + parent.lval = new_call + elif isinstance(parent, ast_internal_classes.Parenthesis_Expr_Node): + parent.expr = new_call + elif isinstance(parent, ast_internal_classes.Call_Expr_Node): + for idx, arg in enumerate(parent.args): + if arg == old_call: + parent.args[idx] = new_call + break + else: + raise NotImplementedError() + + def visit_BinOp_Node(self, binop_node: ast_internal_classes.BinOp_Node): + + if not isinstance(binop_node.rval, ast_internal_classes.Call_Expr_Node): + return binop_node + + node = binop_node.rval + + name = node.name.name.split('__dace_') + if len(name) != 2 or name[1] not in MathFunctions.INTRINSIC_TO_DACE: + return binop_node + func_name = name[1] + + # Visit all children before we expand this call. + # We need that to properly get the type. 
+ for arg in node.args: + self.visit(arg) + + return_type = None + input_type = None + input_type = self.func_type(node) + + replacement_rule = MathFunctions.INTRINSIC_TO_DACE[func_name] + if isinstance(replacement_rule, dict): + replacement_rule = replacement_rule[input_type] + if replacement_rule.return_type == "FIRST_ARG": + return_type = input_type + else: + return_type = replacement_rule.return_type + + if isinstance(replacement_rule, MathFunctions.MathTransformation): + node.name = ast_internal_classes.Name_Node(name=replacement_rule.function) + node.type = return_type + + else: + binop_node.rval = replacement_rule.replacement_function(node) + + # replace types of return variable - LHS of the binary operator + var = binop_node.lval + name = None + if isinstance(var.name, ast_internal_classes.Name_Node): + name = var.name.name + else: + name = var.name + var_decl = self.scope_vars.get_var(var.parent, name) + var.type = input_type + var_decl.type = input_type + + return binop_node + + @staticmethod + def dace_functions(): + + # list of final dace functions which we create + funcs = list(MathFunctions.INTRINSIC_TO_DACE.values()) + res = [] + # flatten nested lists + for f in funcs: + if isinstance(f, dict): + res.extend([v.function for k, v in f.items() if v.function is not None]) + else: + if f.function is not None: + res.append(f.function) + return res + + @staticmethod + def temporary_functions(): + + # temporary functions created by us -> f becomes __dace_f + # We provide this to tell Fortran parser that these are function calls, + # not array accesses + funcs = list(MathFunctions.INTRINSIC_TO_DACE.keys()) + return [f'__dace_{f}' for f in funcs] + + @staticmethod + def replacable(func_name: str) -> bool: + return func_name in MathFunctions.INTRINSIC_TO_DACE + + @staticmethod + def replace(func_name: str) -> ast_internal_classes.FNode: + return ast_internal_classes.Name_Node(name=f'__dace_{func_name}') + + def has_transformation() -> bool: + return True + + @staticmethod + def get_transformation() -> TypeTransformer: + return MathFunctions.TypeTransformer() + class FortranIntrinsics: IMPLEMENTATIONS_AST = { - "SELECTED_INT_KIND": SelectedKind, - "SELECTED_REAL_KIND": SelectedKind, "SUM": Sum, "PRODUCT": Product, "ANY": Any, @@ -954,11 +1285,6 @@ class FortranIntrinsics: "MERGE": Merge } - DIRECT_REPLACEMENTS = { - "__dace_selected_int_kind": SelectedKind, - "__dace_selected_real_kind": SelectedKind - } - EXEMPTED_FROM_CALL_EXTRACTION = [ Merge ] @@ -971,59 +1297,58 @@ def transformations(self) -> Set[Type[NodeTransformer]]: @staticmethod def function_names() -> List[str]: - return list(LoopBasedReplacement.INTRINSIC_TO_DACE.values()) + # list of all functions that are created by initial transformation, before doing full replacement + # this prevents other parser components from replacing our function calls with array subscription nodes + return [*list(LoopBasedReplacement.INTRINSIC_TO_DACE.values()), *MathFunctions.temporary_functions(), *DirectReplacement.temporary_functions()] + + @staticmethod + def retained_function_names() -> List[str]: + # list of all DaCe functions that we use after full parsing + return MathFunctions.dace_functions() @staticmethod def call_extraction_exemptions() -> List[str]: - return [func.Transformation.func_name() for func in FortranIntrinsics.EXEMPTED_FROM_CALL_EXTRACTION] + return [ + *[func.Transformation.func_name() for func in FortranIntrinsics.EXEMPTED_FROM_CALL_EXTRACTION] + #*MathFunctions.temporary_functions() + ] def replace_function_name(self, 
node: FASTNode) -> ast_internal_classes.Name_Node: func_name = node.string replacements = { - "INT": "__dace_int", - "DBLE": "__dace_dble", - "SQRT": "sqrt", - "COSH": "cosh", - "ABS": "abs", - "MIN": "min", - "MAX": "max", - "EXP": "exp", - "EPSILON": "__dace_epsilon", - "TANH": "tanh", "SIGN": "__dace_sign", - "EXP": "exp" } if func_name in replacements: return ast_internal_classes.Name_Node(name=replacements[func_name]) - else: - - if self.IMPLEMENTATIONS_AST[func_name].has_transformation(): - self._transformations_to_run.add(self.IMPLEMENTATIONS_AST[func_name].Transformation) + elif DirectReplacement.replacable_name(func_name): + if DirectReplacement.has_transformation(func_name): + self._transformations_to_run.add(DirectReplacement.get_transformation()) + return DirectReplacement.replace_name(func_name) + elif MathFunctions.replacable(func_name): + self._transformations_to_run.add(MathFunctions.get_transformation()) + return MathFunctions.replace(func_name) + + if self.IMPLEMENTATIONS_AST[func_name].has_transformation(): + + if hasattr(self.IMPLEMENTATIONS_AST[func_name], "Transformation"): + self._transformations_to_run.add(self.IMPLEMENTATIONS_AST[func_name].Transformation()) + else: + self._transformations_to_run.add(self.IMPLEMENTATIONS_AST[func_name].get_transformation(func_name)) - return ast_internal_classes.Name_Node(name=self.IMPLEMENTATIONS_AST[func_name].replaced_name(func_name)) + return ast_internal_classes.Name_Node(name=self.IMPLEMENTATIONS_AST[func_name].replaced_name(func_name)) def replace_function_reference(self, name: ast_internal_classes.Name_Node, args: ast_internal_classes.Arg_List_Node, line): func_types = { - "__dace_int": "INT", - "__dace_dble": "DOUBLE", - "sqrt": "DOUBLE", - "cosh": "DOUBLE", - "abs": "DOUBLE", - "min": "DOUBLE", - "max": "DOUBLE", - "exp": "DOUBLE", - "__dace_epsilon": "DOUBLE", - "tanh": "DOUBLE", "__dace_sign": "DOUBLE", } if name.name in func_types: # FIXME: this will be progressively removed call_type = func_types[name.name] return ast_internal_classes.Call_Expr_Node(name=name, type=call_type, args=args.args, line_number=line) - elif name.name in self.DIRECT_REPLACEMENTS: - return self.DIRECT_REPLACEMENTS[name.name].replace(name, args, line) + elif DirectReplacement.replacable(name.name): + return DirectReplacement.replace(name.name, args, line) else: # We will do the actual type replacement later # To that end, we need to know the input types - but these we do not know at the moment. 
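
To make the DirectReplacement arithmetic above concrete, the following is a minimal standalone sketch (hypothetical helper names, not part of this patch) of what replace_int_kind and replace_real_kind fold into integer literals at parse time:

    import math

    def selected_int_kind(r: int) -> int:
        # Bytes needed for a signed integer that can hold +/- 10**r,
        # mirroring replace_int_kind above.
        return math.ceil((math.log2(10 ** r) + 1) / 8)

    def selected_real_kind(p: int, r: int = 0) -> int:
        # Precision / exponent-range thresholds, mirroring replace_real_kind above.
        if p >= 9 or r > 126:
            return 8   # double precision
        elif p >= 3 or r > 14:
            return 4   # single precision
        return 2       # half precision

    assert selected_int_kind(4) == 2         # 16-bit integers cover +/- 9999
    assert selected_int_kind(9) == 4         # 32-bit integers cover +/- 10**9
    assert selected_real_kind(13, 300) == 8  # needs double precision
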
diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index cacf15d785..1cbb8e67c9 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1342,7 +1342,7 @@ def defined(self): # MPI-related stuff result.update({ - k: self.sdfg.process_grids[v] + v: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids }) try: @@ -1489,19 +1489,19 @@ def _symbols_from_params(self, params: List[Tuple[str, Union[str, dtypes.typecla else: values = str(val).split(':') if len(values) == 1: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) elif len(values) == 2: result[name] = symbolic.symbol( name, dtypes.result_type_of(infer_expr_type(values[0], { - **self.globals, + **self.defined, **dyn_inputs }), infer_expr_type(values[1], { - **self.globals, + **self.defined, **dyn_inputs }))) elif len(values) == 3: - result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.globals, **dyn_inputs})) + result[name] = symbolic.symbol(name, infer_expr_type(values[0], {**self.defined, **dyn_inputs})) else: raise DaceSyntaxError( self, None, "Invalid number of arguments in a range iterator. " @@ -3258,18 +3258,23 @@ def visit_AnnAssign(self, node: ast.AnnAssign): dtype = astutils.evalnode(node.annotation, {**self.globals, **self.defined}) if isinstance(dtype, data.Data): simple_type = dtype.dtype + storage = dtype.storage else: simple_type = dtype + storage = dtypes.StorageType.Default if not isinstance(simple_type, dtypes.typeclass): raise TypeError except: dtype = None + storage = dtypes.StorageType.Default type_name = rname(node.annotation) warnings.warn('typeclass {} is not supported'.format(type_name)) if node.value is None and dtype is not None: # Annotating type without assignment self.annotated_types[rname(node.target)] = dtype return - self._visit_assign(node, node.target, None, dtype=dtype) + results = self._visit_assign(node, node.target, None, dtype=dtype) + if storage != dtypes.StorageType.Default: + self.sdfg.arrays[results[0][0]].storage = storage def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): # Get targets (elts) and results @@ -3563,6 +3568,8 @@ def _visit_assign(self, node, node_target, op, dtype=None, is_return=False): self.cfg_target.add_edge(self.last_block, output_indirection, dace.sdfg.InterstateEdge()) self.last_block = output_indirection + return results + def visit_AugAssign(self, node: ast.AugAssign): self._visit_assign(node, node.target, augassign_ops[type(node.op).__name__]) @@ -3933,6 +3940,9 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no for arg in args_to_remove: args.remove(arg) + # Refresh temporary transient counter of the nested SDFG + sdfg.refresh_temp_transients() + # Change connector names updated_args = [] arrays_before = list(sdfg.arrays.items()) @@ -4454,7 +4464,14 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): func = node.func.value if func is None: - funcname = rname(node) + func_result = self.visit(node.func) + if isinstance(func_result, str): + if isinstance(node.func, ast.Attribute): + funcname = f'{func_result}.{node.func.attr}' + else: + funcname = func_result + else: + funcname = rname(node) # Check if the function exists as an SDFG in a different module modname = until(funcname, '.') if ('.' 
in funcname and len(modname) > 0 and modname in self.globals @@ -4569,7 +4586,7 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): arg = self.scope_vars[modname] else: # Fallback to (name, object) - arg = (modname, self.defined[modname]) + arg = modname args.append(arg) # Otherwise, try to find a default implementation for the SDFG elif not found_ufunc: @@ -4623,10 +4640,16 @@ def visit_Call(self, node: ast.Call, create_callbacks=False): self._add_state('call_%d' % node.lineno) self.last_block.set_default_lineinfo(self.current_lineinfo) - if found_ufunc: - result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) - else: - result = func(self, self.sdfg, self.last_block, *args, **keywords) + try: + if found_ufunc: + result = func(self, node, self.sdfg, self.last_block, ufunc_name, args, keywords) + else: + result = func(self, self.sdfg, self.last_block, *args, **keywords) + except DaceSyntaxError as ex: + # Attach source information to exception + if ex.node is None: + ex.node = node + raise self.last_block.set_default_lineinfo(None) @@ -4782,12 +4805,18 @@ def _visitname(self, name: str, node: ast.AST): self.sdfg.add_symbol(result.name, result.dtype) return result + if name in self.closure.callbacks: + return name + if name in self.sdfg.arrays: return name if name in self.sdfg.symbols: return name + if name in __builtins__: + return name + if name not in self.scope_vars: raise DaceSyntaxError(self, node, 'Use of undefined variable "%s"' % name) rname = self.scope_vars[name] @@ -4832,33 +4861,43 @@ def visit_NameConstant(self, node: NameConstant): return self.visit_Constant(node) def visit_Attribute(self, node: ast.Attribute): - # If visiting an attribute, return attribute value if it's of an array or global - name = until(astutils.unparse(node), '.') - result = self._visitname(name, node) + result = self.visit(node.value) + if isinstance(result, (tuple, list, dict)): + if len(result) > 1: + raise DaceSyntaxError( + self, node.value, f'{type(result)} object cannot use attributes. 
Try storing the ' + 'object to a different variable first (e.g., ``a = result; a.attribute``') + else: + result = result[0] + tmpname = f"{result}.{astutils.unparse(node.attr)}" if tmpname in self.sdfg.arrays: return tmpname + if isinstance(result, str) and result in self.sdfg.arrays: arr = self.sdfg.arrays[result] elif isinstance(result, str) and result in self.scope_arrays: arr = self.scope_arrays[result] else: - return result + arr = None # Try to find sub-SDFG attribute - func = oprepo.Replacements.get_attribute(type(arr), node.attr) - if func is not None: - # A new state is likely needed here, e.g., for transposition (ndarray.T) - self._add_state('%s_%d' % (type(node).__name__, node.lineno)) - self.last_block.set_default_lineinfo(self.current_lineinfo) - result = func(self, self.sdfg, self.last_block, result) - self.last_block.set_default_lineinfo(None) - return result + if arr is not None: + func = oprepo.Replacements.get_attribute(type(arr), node.attr) + if func is not None: + # A new state is likely needed here, e.g., for transposition (ndarray.T) + self._add_state('%s_%d' % (type(node).__name__, node.lineno)) + self.last_block.set_default_lineinfo(self.current_lineinfo) + result = func(self, self.sdfg, self.last_block, result) + self.last_block.set_default_lineinfo(None) + return result # Otherwise, try to find compile-time attribute (such as shape) try: - return getattr(arr, node.attr) - except KeyError: + if arr is not None: + return getattr(arr, node.attr) + return getattr(result, node.attr) + except (AttributeError, KeyError): return result def visit_List(self, node: ast.List): diff --git a/dace/frontend/python/parser.py b/dace/frontend/python/parser.py index d99be1265d..20018effd0 100644 --- a/dace/frontend/python/parser.py +++ b/dace/frontend/python/parser.py @@ -92,14 +92,15 @@ def infer_symbols_from_datadescriptor(sdfg: SDFG, desc = sdfg.arrays[arg_name] if not hasattr(desc, 'shape') or not hasattr(arg_val, 'shape'): continue - symbolic_values = list(desc.shape) + list(getattr(desc, 'strides', [])) + symbolic_values = list(desc.shape) + list(getattr(desc, 'strides', [])) + list(getattr(desc, 'offset', [])) given_values = list(arg_val.shape) given_strides = [] if hasattr(arg_val, 'strides'): # NumPy arrays use bytes in strides factor = getattr(arg_val, 'itemsize', 1) given_strides = [s // factor for s in arg_val.strides] - given_values += given_strides + given_offset = [o for o in arg_val.offset] if hasattr(arg_val, 'offset') else [] + given_values += given_strides + given_offset for sym_dim, real_dim in zip(symbolic_values, given_values): repldict = {} @@ -759,7 +760,7 @@ def _load_sdfg(self, path: str, *args, **kwargs): if sdfg is not None: # Set regenerate and recompile flags - sdfg._regenerate_code = self.regenerate_code + sdfg.regenerate_code = self.regenerate_code sdfg._recompile = self.recompile return sdfg, self._cache.make_key(argtypes, given_args, self.closure_array_keys, self.closure_constant_keys, @@ -927,7 +928,7 @@ def _generate_pdp(self, args: Tuple[Any], kwargs: Dict[str, Any], # TODO: Add to parsed SDFG cache # Set regenerate and recompile flags - sdfg._regenerate_code = self.regenerate_code + sdfg.regenerate_code = self.regenerate_code sdfg._recompile = self.recompile return sdfg, cached diff --git a/dace/frontend/python/preprocessing.py b/dace/frontend/python/preprocessing.py index eca07a4930..f51b67ddb2 100644 --- a/dace/frontend/python/preprocessing.py +++ b/dace/frontend/python/preprocessing.py @@ -527,6 +527,8 @@ def global_value_to_node(self, elif 
isinstance(value, symbolic.symbol): # Symbols resolve to the symbol name newnode = ast.Name(id=value.name, ctx=ast.Load()) + elif isinstance(value, sympy.Basic): # Symbolic or constant expression + newnode = ast.parse(symbolic.symstr(value)).body[0].value elif isinstance(value, ast.Name): newnode = ast.Name(id=value.id, ctx=ast.Load()) elif (dtypes.isconstant(value) or isinstance(value, (StringLiteral, SDFG)) or hasattr(value, '__sdfg__')): diff --git a/dace/frontend/python/replacements.py b/dace/frontend/python/replacements.py index 5e6118a34b..c5b3e3b2a2 100644 --- a/dace/frontend/python/replacements.py +++ b/dace/frontend/python/replacements.py @@ -313,6 +313,9 @@ def _numpy_full(pv: ProgramVisitor, """ Creates and array of the specified shape and initializes it with the fill value. """ + if isinstance(shape, Number) or symbolic.issymbolic(shape): + shape = [shape] + is_data = False if isinstance(fill_value, (Number, np.bool_)): vtype = dtypes.dtype_to_typeclass(type(fill_value)) @@ -322,24 +325,30 @@ def _numpy_full(pv: ProgramVisitor, is_data = True vtype = sdfg.arrays[fill_value].dtype dtype = dtype or vtype + + # Handle one-dimensional inputs + if isinstance(shape, (Number, str)) or symbolic.issymbolic(shape): + shape = [shape] + + if any(isinstance(s, str) for s in shape): + raise DaceSyntaxError( + pv, None, f'Data-dependent shape {shape} is currently not allowed. Only constants ' + 'and symbolic values can be used.') + name, _ = sdfg.add_temp_transient(shape, dtype) if is_data: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, dict(__inp=dace.Memlet(data=fill_value, subset='0')), "__out = __inp", dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) else: state.add_mapped_tasklet( - '_numpy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_numpy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) @@ -459,10 +468,8 @@ def _numpy_flip(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, axis inpidx = ','.join([f'__i{i}' for i in range(ndim)]) outidx = ','.join([f'{s} - __i{i} - 1' if a else f'__i{i}' for i, (a, s) in enumerate(zip(axis, desc.shape))]) state.add_mapped_tasklet(name="_numpy_flip_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -532,10 +539,8 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 outidx = ','.join(out_indices) state.add_mapped_tasklet(name="_rot90_", - map_ranges={ - f'__i{i}': f'0:{s}:1' - for i, s in enumerate(desc.shape) - }, + map_ranges={f'__i{i}': f'0:{s}:1' + for i, s in enumerate(desc.shape)}, inputs={'__inp': Memlet(f'{arr}[{inpidx}]')}, code='__out = __inp', outputs={'__out': Memlet(f'{arr_copy}[{outidx}]')}, @@ -546,8 +551,13 @@ def _numpy_rot90(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, k=1 @oprepo.replaces('numpy.arange') @oprepo.replaces('dace.arange') -def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, 
**kwargs): - """ Implementes numpy.arange """ +def _arange(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + *args, + dtype: dtypes.typeclass = None, + like: Optional[str] = None): + """ Implements numpy.arange """ start = 0 stop = None @@ -561,35 +571,42 @@ def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): else: start, stop, step = args + if isinstance(start, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar start value "{start}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + if isinstance(stop, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar stop value "{stop}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + if isinstance(step, str): + raise TypeError(f'Cannot compile numpy.arange with a scalar step value "{step}" (only constants and symbolic ' + 'expressions are supported). Please use numpy.linspace instead.') + actual_step = step if isinstance(start, Number) and isinstance(stop, Number): actual_step = type(start + step)(start + step) - start if any(not isinstance(s, Number) for s in [start, stop, step]): - shape = (symbolic.int_ceil(stop - start, step), ) + if step == 1: # Common case where ceiling is not necessary + shape = (stop - start,) + else: + shape = (symbolic.int_ceil(stop - start, step), ) else: shape = (np.int64(np.ceil((stop - start) / step)), ) - if not isinstance(shape[0], Number) and ('dtype' not in kwargs or kwargs['dtype'] == None): - raise NotImplementedError("The current implementation of numpy.arange requires that the output dtype is given " - "when at least one of (start, stop, step) is symbolic.") + # Infer dtype from input arguments + if dtype is None: + dtype, _ = _result_type(args) + # TODO: Unclear what 'like' does - # if 'like' in kwargs and kwargs['like'] != None: - # outname, outarr = sdfg.add_temp_transient_like(sdfg.arrays[kwargs['like']]) + # if like is not None: + # outname, outarr = sdfg.add_temp_transient_like(sdfg.arrays[like]) # outarr.shape = shape - if 'dtype' in kwargs and kwargs['dtype'] != None: - dtype = kwargs['dtype'] - if not isinstance(dtype, dtypes.typeclass): - dtype = dtypes.dtype_to_typeclass(dtype) - outname, outarr = sdfg.add_temp_transient(shape, dtype) - else: - # infer dtype based on args's dtype - # (since the `dtype` keyword argument isn't given, none of the arguments can be symbolic) - if any(isinstance(arg, (float, np.float32, np.float64)) for arg in args): - dtype = dtypes.float64 - else: - dtype = dtypes.int64 - outname, outarr = sdfg.add_temp_transient(shape, dtype) + if not isinstance(dtype, dtypes.typeclass): + dtype = dtypes.dtype_to_typeclass(dtype) + outname, outarr = sdfg.add_temp_transient(shape, dtype) + + start = f'decltype(__out)({start})' + actual_step = f'decltype(__out)({actual_step})' state.add_mapped_tasklet(name="_numpy_arange_", map_ranges={'__i': f"0:{shape[0]}"}, @@ -601,6 +618,131 @@ def _arange(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, *args, **kwargs): return outname +def _add_axis_to_shape(shape: Sequence[symbolic.SymbolicType], axis: int, + axis_value: Any) -> List[symbolic.SymbolicType]: + if axis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + if axis < 0: + naxis = len(shape) + 1 + axis + if naxis < 0 or naxis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + axis = naxis + + # Make a new shape list with 
the inserted dimension + new_shape = [None] * (len(shape) + 1) + for i in range(len(shape) + 1): + if i == axis: + new_shape[i] = axis_value + elif i < axis: + new_shape[i] = shape[i] + else: + new_shape[i] = shape[i - 1] + + return new_shape + + +@oprepo.replaces('numpy.linspace') +def _linspace(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + start: Union[Number, symbolic.SymbolicType, str], + stop: Union[Number, symbolic.SymbolicType, str], + num: Union[Integral, symbolic.SymbolicType] = 50, + endpoint: bool = True, + retstep: bool = False, + dtype: dtypes.typeclass = None, + axis: int = 0): + """ Implements numpy.linspace """ + # Argument checks + if not isinstance(num, (Integral, sp.Basic)): + raise TypeError('numpy.linspace can only be compiled when the ``num`` argument is symbolic or constant.') + if not isinstance(axis, Integral): + raise TypeError('numpy.linspace can only be compiled when the ``axis`` argument is constant.') + + # Start and stop are broadcast together, then, a new dimension is added to axis (taken from ``ndim + 1``), + # along which the numbers are filled. + start_shape = sdfg.arrays[start].shape if (isinstance(start, str) and start in sdfg.arrays) else [] + stop_shape = sdfg.arrays[stop].shape if (isinstance(stop, str) and stop in sdfg.arrays) else [] + + shape, ranges, outind, ind1, ind2 = _broadcast_together(start_shape, stop_shape) + shape_with_axis = _add_axis_to_shape(shape, axis, num) + ranges_with_axis = _add_axis_to_shape(ranges, axis, ('__sind', f'0:{symbolic.symstr(num)}')) + if outind: + outind_with_axis = _add_axis_to_shape(outind.split(', '), axis, '__sind') + else: + outind_with_axis = ['__sind'] + + if dtype is None: + # Infer output type from start and stop + start_type = sdfg.arrays[start] if (isinstance(start, str) and start in sdfg.arrays) else start + stop_type = sdfg.arrays[stop] if (isinstance(stop, str) and stop in sdfg.arrays) else stop + + dtype, _ = _result_type((start_type, stop_type), 'Add') + + # From the NumPy documentation: The inferred dtype will never be an integer; float is chosen even if the + # arguments would produce an array of integers. 
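+    # e.g., numpy.linspace(0, 10, num=5) yields float64 values even though start and stop are integers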
+ if dtype in (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16, dtypes.uint32, + dtypes.uint64): + dtype = dtypes.dtype_to_typeclass(float) + + outname, _ = sdfg.add_temp_transient(shape_with_axis, dtype) + + if endpoint == True: + num -= 1 + + # Fill in input memlets as necessary + inputs = {} + if isinstance(start, str) and start in sdfg.arrays: + index = f'[{ind1}]' if ind1 else '' + inputs['__start'] = Memlet(f'{start}{index}') + startcode = '__start' + else: + startcode = symbolic.symstr(start) + + if isinstance(stop, str) and stop in sdfg.arrays: + index = f'[{ind2}]' if ind2 else '' + inputs['__stop'] = Memlet(f'{stop}{index}') + stopcode = '__stop' + else: + stopcode = symbolic.symstr(stop) + + # Create tasklet code based on inputs + code = f'__out = {startcode} + __sind * decltype(__out)({stopcode} - {startcode}) / decltype(__out)({symbolic.symstr(num)})' + + state.add_mapped_tasklet(name="linspace", + map_ranges=ranges_with_axis, + inputs=inputs, + code=code, + outputs={'__out': Memlet(f"{outname}[{','.join(outind_with_axis)}]")}, + external_edges=True) + + if retstep == False: + return outname + + # Return step if requested + + # Handle scalar outputs + if not ranges: + ranges = [('__unused', '0:1')] + out_index = f'[{outind}]' + + if len(shape) > 0: + stepname, _ = sdfg.add_temp_transient(shape, dtype) + else: + stepname, _ = sdfg.add_scalar(sdfg.temp_data_name(), dtype, transient=True) + out_index = '[0]' + + state.add_mapped_tasklet( + 'retstep', + ranges, + copy.deepcopy(inputs), + f'__out = decltype(__out)({stopcode} - {startcode}) / decltype(__out)({symbolic.symstr(num)})', + {'__out': Memlet(f"{stepname}{out_index}")}, + external_edges=True) + + return outname, stepname + + @oprepo.replaces('elementwise') @oprepo.replaces('dace.elementwise') def _elementwise(pv: 'ProgramVisitor', @@ -644,7 +786,8 @@ def _elementwise(pv: 'ProgramVisitor', else: state.add_mapped_tasklet( name="_elementwise_", - map_ranges={f'__i{dim}': f'0:{N}' for dim, N in enumerate(inparr.shape)}, + map_ranges={f'__i{dim}': f'0:{N}' + for dim, N in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(in_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, code=code, outputs={'__out': Memlet.simple(out_array, ','.join([f'__i{dim}' for dim in range(len(inparr.shape))]))}, @@ -694,10 +837,8 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: else: state.add_mapped_tasklet( name=func, - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(inparr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(inparr.shape)}, inputs={'__inp': Memlet.simple(inpname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, code='__out = {f}(__inp)'.format(f=func), outputs={'__out': Memlet.simple(outname, ','.join(['__i%d' % i for i in range(len(inparr.shape))]))}, @@ -707,9 +848,9 @@ def _simple_call(sdfg: SDFG, state: SDFGState, inpname: str, func: str, restype: def _complex_to_scalar(complex_type: dace.typeclass): - if complex_type is dace.complex64: + if complex_type == dace.complex64: return dace.float32 - elif complex_type is dace.complex128: + elif complex_type == dace.complex128: return dace.float64 else: return complex_type @@ -813,7 +954,8 @@ def _len_array(pv: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, a: str): return sdfg.arrays[a].shape[0] if a in sdfg.constants_prop: return len(sdfg.constants[a]) - raise TypeError(f'`len` is not supported for input "{a}" (type {type(a)})') + else: + 
return len(a) @oprepo.replaces('transpose') @@ -1046,27 +1188,22 @@ def _argminmax(pv: ProgramVisitor, code = "__init = _val_and_idx(val={}, idx=-1)".format( dtypes.min_value(a_arr.dtype) if func == 'max' else dtypes.max_value(a_arr.dtype)) - nest.add_state().add_mapped_tasklet(name="_arg{}_convert_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, - inputs={}, - code=code, - outputs={ - '__init': - Memlet.simple( - reduced_structs, - ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) - }, - external_edges=True) + nest.add_state().add_mapped_tasklet( + name="_arg{}_convert_".format(func), + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, + inputs={}, + code=code, + outputs={ + '__init': Memlet.simple(reduced_structs, + ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) + }, + external_edges=True) nest.add_state().add_mapped_tasklet( name="_arg{}_reduce_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape)}, inputs={'__in': Memlet.simple(a, ','.join('__i%d' % i for i in range(len(a_arr.shape))))}, code="__out = _val_and_idx(idx={}, val=__in)".format("__i%d" % axis), outputs={ @@ -1086,10 +1223,8 @@ def _argminmax(pv: ProgramVisitor, nest.add_state().add_mapped_tasklet( name="_arg{}_extract_".format(func), - map_ranges={ - '__i%d' % i: '0:%s' % n - for i, n in enumerate(a_arr.shape) if i != axis - }, + map_ranges={'__i%d' % i: '0:%s' % n + for i, n in enumerate(a_arr.shape) if i != axis}, inputs={ '__in': Memlet.simple(reduced_structs, ','.join('__i%d' % i for i in range(len(a_arr.shape)) if i != axis)) @@ -1212,10 +1347,9 @@ def _unop(sdfg: SDFG, state: SDFGState, op1: str, opcode: str, opname: str): opcode = 'not' name, _ = sdfg.add_temp_transient(arr1.shape, restype, arr1.storage) - state.add_mapped_tasklet("_%s_" % opname, { - '__i%d' % i: '0:%s' % s - for i, s in enumerate(arr1.shape) - }, {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, + state.add_mapped_tasklet("_%s_" % opname, {'__i%d' % i: '0:%s' % s + for i, s in enumerate(arr1.shape)}, + {'__in1': Memlet.simple(op1, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, '__out = %s __in1' % opcode, {'__out': Memlet.simple(name, ','.join(['__i%d' % i for i in range(len(arr1.shape))]))}, external_edges=True) @@ -1639,8 +1773,17 @@ def _result_type(arguments: Sequence[Union[str, Number, symbolic.symbol, sp.Basi else: # Operators with 3 or more arguments result_type = _np_result_type(dtypes_for_result) + coarse_result_type = None + if result_type in complex_types: + coarse_result_type = 3 # complex + elif result_type in float_types: + coarse_result_type = 2 # float + elif result_type in signed_types: + coarse_result_type = 1 # signed integer, bool + else: + coarse_result_type = 0 # unsigned integer for i, t in enumerate(coarse_types): - if t != result_type: + if t != coarse_result_type: casting[i] = _cast_str(result_type) return result_type, casting @@ -2519,6 +2662,13 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op code="__out = log1p(__in1)", reduce=None, initial=np.log1p.identity), + clip=dict(name="_numpy_clip_", + operator=None, + inputs=["__in_a", "__in_amin", "__in_amax"], + outputs=["__out"], + code="__out = min(max(__in_a, __in_amin), __in_amax)", + reduce=None, + initial=np.inf), 
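+    # (clip above computes min(max(a, a_min), a_max) elementwise, matching numpy.clip semantics)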
sqrt=dict(name="_numpy_sqrt_", operator="Sqrt", inputs=["__in1"], @@ -4094,14 +4244,13 @@ def implement_ufunc_outer(visitor: ProgramVisitor, ast_node: ast.Call, sdfg: SDF @oprepo.replaces('numpy.reshape') -def reshape( - pv: ProgramVisitor, - sdfg: SDFG, - state: SDFGState, - arr: str, - newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], - order: StringLiteral = StringLiteral('C') -) -> str: +def reshape(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arr: str, + newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], + order: StringLiteral = StringLiteral('C'), + strides: Optional[Any] = None) -> str: if isinstance(arr, (list, tuple)) and len(arr) == 1: arr = arr[0] desc = sdfg.arrays[arr] @@ -4115,10 +4264,11 @@ def reshape( # New shape and strides as symbolic expressions newshape = [symbolic.pystr_to_symbolic(s) for s in newshape] - if fortran_strides: - strides = [data._prod(newshape[:i]) for i in range(len(newshape))] - else: - strides = [data._prod(newshape[i + 1:]) for i in range(len(newshape))] + if strides is None: + if fortran_strides: + strides = [data._prod(newshape[:i]) for i in range(len(newshape))] + else: + strides = [data._prod(newshape[i + 1:]) for i in range(len(newshape))] newarr, newdesc = sdfg.add_view(arr, newshape, @@ -4316,10 +4466,8 @@ def _ndarray_fill(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, va shape = sdfg.arrays[arr].shape state.add_mapped_tasklet( '_numpy_fill_', - map_ranges={ - f"__i{dim}": f"0:{s}" - for dim, s in enumerate(shape) - }, + map_ranges={f"__i{dim}": f"0:{s}" + for dim, s in enumerate(shape)}, inputs=inputs, code=f"__out = {body}", outputs={'__out': dace.Memlet.simple(arr, ",".join([f"__i{dim}" for dim in range(len(shape))]))}, @@ -4335,9 +4483,13 @@ def _ndarray_reshape( sdfg: SDFG, state: SDFGState, arr: str, - newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], + *newshape: Union[str, symbolic.SymbolicType, Tuple[Union[str, symbolic.SymbolicType]]], order: StringLiteral = StringLiteral('C') ) -> str: + if len(newshape) == 0: + raise TypeError('reshape() takes at least 1 argument (0 given)') + if len(newshape) == 1 and isinstance(newshape, (list, tuple)): + newshape = newshape[0] return reshape(pv, sdfg, state, arr, newshape, order) @@ -4544,6 +4696,13 @@ def _ndarray_astype(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, arr: str, return _datatype_converter(sdfg, state, arr, dtype)[0] +@oprepo.replaces_operator('Array', 'MatMult', otherclass='StorageType') +def _cast_storage(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, arr: str, stype: dace.StorageType) -> str: + desc = sdfg.arrays[arr] + desc.storage = stype + return arr + + # Replacements that need ufuncs ############################################### # TODO: Fix by separating to different modules and importing @@ -4747,13 +4906,7 @@ def _tensordot(pv: 'ProgramVisitor', @oprepo.replaces("cupy._core.core.ndarray") @oprepo.replaces("cupy.ndarray") -def _define_cupy_local( - pv: "ProgramVisitor", - sdfg: SDFG, - state: SDFGState, - shape: Shape, - dtype: typeclass -): +def _define_cupy_local(pv: "ProgramVisitor", sdfg: SDFG, state: SDFGState, shape: Shape, dtype: typeclass): """Defines a local array in a DaCe program.""" if not isinstance(shape, (list, tuple)): shape = [shape] @@ -4781,10 +4934,8 @@ def _cupy_full(pv: ProgramVisitor, name, _ = sdfg.add_temp_transient(shape, dtype, storage=dtypes.StorageType.GPU_Global) state.add_mapped_tasklet( - 
'_cupy_full_', { - "__i{}".format(i): "0: {}".format(s) - for i, s in enumerate(shape) - }, {}, + '_cupy_full_', {"__i{}".format(i): "0: {}".format(s) + for i, s in enumerate(shape)}, {}, "__out = {}".format(fill_value), dict(__out=dace.Memlet.simple(name, ",".join(["__i{}".format(i) for i in range(len(shape))]))), external_edges=True) @@ -4843,3 +4994,407 @@ def _op(visitor: 'ProgramVisitor', sdfg: SDFG, state: SDFGState, op1: StringLite for op, method in _boolop_to_method.items(): _makeboolop(op, method) + + +@oprepo.replaces('numpy.concatenate') +def _concat(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arrays: Tuple[Any], + axis: Optional[int] = 0, + out: Optional[Any] = None, + *, + dtype=None, + casting: str = 'same_kind'): + if dtype is not None and out is not None: + raise ValueError('Arguments dtype and out cannot be given together') + if casting != 'same_kind': + raise NotImplementedError('The casting argument is currently unsupported') + if not isinstance(arrays, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile concatenation') + if axis is not None and not isinstance(axis, Integral): + raise ValueError('Axis is not a compile-time evaluatable integer, cannot compile concatenation') + if len(arrays) == 1: + return arrays[0] + for i in range(len(arrays)): + if arrays[i] not in sdfg.arrays: + raise TypeError(f'Index {i} is not an array') + if out is not None: + if out not in sdfg.arrays: + raise TypeError('Output is not an array') + dtype = sdfg.arrays[out].dtype + + descs = [sdfg.arrays[arr] for arr in arrays] + shape = list(descs[0].shape) + + if axis is None: # Flatten arrays, then concatenate + arrays = [flat(visitor, sdfg, state, arr) for arr in arrays] + descs = [sdfg.arrays[arr] for arr in arrays] + shape = list(descs[0].shape) + axis = 0 + else: + # Check shapes for validity + first_shape = copy.copy(shape) + first_shape[axis] = 0 + for i, d in enumerate(descs[1:]): + other_shape = list(d.shape) + other_shape[axis] = 0 + if other_shape != first_shape: + raise ValueError(f'Array shapes do not match at index {i}') + + shape[axis] = sum(desc.shape[axis] for desc in descs) + if out is None: + if dtype is None: + dtype = descs[0].dtype + name, odesc = sdfg.add_temp_transient(shape, dtype, storage=descs[0].storage, lifetime=descs[0].lifetime) + else: + name = out + odesc = sdfg.arrays[out] + + # Make copies + w = state.add_write(name) + offset = 0 + subset = subsets.Range.from_array(odesc) + for arr, desc in zip(arrays, descs): + r = state.add_read(arr) + subset = copy.deepcopy(subset) + subset[axis] = (offset, offset + desc.shape[axis] - 1, 1) + state.add_edge(r, None, w, None, Memlet(data=name, subset=subset)) + offset += desc.shape[axis] + + return name + + +@oprepo.replaces('numpy.stack') +def _stack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + arrays: Tuple[Any], + axis: int = 0, + out: Any = None, + *, + dtype=None, + casting: str = 'same_kind'): + if dtype is not None and out is not None: + raise ValueError('Arguments dtype and out cannot be given together') + if casting != 'same_kind': + raise NotImplementedError('The casting argument is currently unsupported') + if not isinstance(arrays, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if not isinstance(axis, Integral): + raise ValueError('Axis is not a compile-time evaluatable integer, cannot compile stack call') + + for i in range(len(arrays)): + if arrays[i] not in sdfg.arrays: + raise TypeError(f'Index {i} 
is not an array') + + descs = [sdfg.arrays[a] for a in arrays] + shape = descs[0].shape + for i, d in enumerate(descs[1:]): + if d.shape != shape: + raise ValueError(f'Array shapes are not equal ({shape} != {d.shape} at index {i})') + + if axis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + if axis < 0: + naxis = len(shape) + 1 + axis + if naxis < 0 or naxis > len(shape): + raise ValueError(f'axis {axis} is out of bounds for array of dimension {len(shape)}') + axis = naxis + + # Stacking is implemented as a reshape followed by concatenation + reshaped = [] + for arr, desc in zip(arrays, descs): + # Make a reshaped view with the inserted dimension + new_shape = [0] * (len(shape) + 1) + new_strides = [0] * (len(shape) + 1) + for i in range(len(shape) + 1): + if i == axis: + new_shape[i] = 1 + new_strides[i] = desc.strides[i - 1] if i != 0 else desc.strides[i] + elif i < axis: + new_shape[i] = shape[i] + new_strides[i] = desc.strides[i] + else: + new_shape[i] = shape[i - 1] + new_strides[i] = desc.strides[i - 1] + + rname = reshape(visitor, sdfg, state, arr, new_shape, strides=new_strides) + reshaped.append(rname) + + return _concat(visitor, sdfg, state, reshaped, axis, out, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.vstack') +@oprepo.replaces('numpy.row_stack') +def _vstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + + # In the 1-D case, stacking is performed along the first axis + if len(sdfg.arrays[tup[0]].shape) == 1: + return _stack(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + # Otherwise, concatenation is performed + return _concat(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.hstack') +@oprepo.replaces('numpy.column_stack') +def _hstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + + # In the 1-D case, concatenation is performed along the first axis + if len(sdfg.arrays[tup[0]].shape) == 1: + return _concat(visitor, sdfg, state, tup, axis=0, out=None, dtype=dtype, casting=casting) + + return _concat(visitor, sdfg, state, tup, axis=1, out=None, dtype=dtype, casting=casting) + + +@oprepo.replaces('numpy.dstack') +def _dstack(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + tup: Tuple[Any], + *, + dtype=None, + casting: str = 'same_kind'): + if not isinstance(tup, (tuple, list)): + raise ValueError('List of arrays is not iterable, cannot compile a stack call') + if tup[0] not in sdfg.arrays: + raise TypeError(f'Index 0 is not an array') + if len(sdfg.arrays[tup[0]].shape) < 3: + raise NotImplementedError('dstack is not implemented for arrays that are smaller than 3D') + + return _concat(visitor, sdfg, state, tup, axis=2, out=None, dtype=dtype, casting=casting) + + +def _split_core(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[int, Sequence[symbolic.SymbolicType], str], axis: int, allow_uneven: bool): + # Argument 
checks + if not isinstance(ary, str) or ary not in sdfg.arrays: + raise TypeError('Split object must be an array') + if not isinstance(axis, Integral): + raise ValueError('Cannot determine split dimension, axis is not a compile-time evaluatable integer') + + desc = sdfg.arrays[ary] + + # Test validity of axis + orig_axis = axis + if axis < 0: + axis = len(desc.shape) + axis + if axis < 0 or axis >= len(desc.shape): + raise ValueError(f'axis {orig_axis} is out of bounds for array of dimension {len(desc.shape)}') + + # indices_or_sections may only be an integer (not symbolic), list of integers, list of symbols, or an array + if isinstance(indices_or_sections, str): + raise ValueError('Array-indexed split cannot be compiled due to data-dependent sizes. ' + 'Consider using numpy.reshape instead.') + elif isinstance(indices_or_sections, (list, tuple)): + if any(isinstance(i, str) for i in indices_or_sections): + raise ValueError('Array-indexed split cannot be compiled due to data-dependent sizes. ' + 'Use symbolic values as an argument instead.') + # Sequence is given + sections = indices_or_sections + elif isinstance(indices_or_sections, Integral): # Constant integer given + if indices_or_sections <= 0: + raise ValueError('Number of sections must be larger than zero.') + + # If uneven sizes are not allowed and ary shape is numeric, check evenness + if not allow_uneven and not symbolic.issymbolic(desc.shape[axis]): + if desc.shape[axis] % indices_or_sections != 0: + raise ValueError('Array split does not result in an equal division. Consider using numpy.array_split ' + 'instead.') + if indices_or_sections > desc.shape[axis]: + raise ValueError('Cannot compile array split as it will result in empty arrays.') + + # Sequence is not given, compute sections + # Mimic behavior of array_split in numpy: Sections are [s+1 x N%s], s, ..., s + size = desc.shape[axis] // indices_or_sections + remainder = desc.shape[axis] % indices_or_sections + sections = [] + offset = 0 + for _ in range(min(remainder, indices_or_sections)): + offset += size + 1 + sections.append(offset) + for _ in range(remainder, indices_or_sections - 1): + offset += size + sections.append(offset) + + elif symbolic.issymbolic(indices_or_sections): + raise ValueError('Symbolic split cannot be compiled due to output tuple size being unknown. 
' + 'Consider using numpy.reshape instead.') + else: + raise TypeError(f'Unsupported type {type(indices_or_sections)} for indices_or_sections in numpy.split') + + # Split according to sections + r = state.add_read(ary) + result = [] + offset = 0 + for section in sections: + shape = list(desc.shape) + shape[axis] = section - offset + name, _ = sdfg.add_temp_transient(shape, desc.dtype, storage=desc.storage, lifetime=desc.lifetime) + # Add copy + w = state.add_write(name) + subset = subsets.Range.from_array(desc) + subset[axis] = (offset, offset + shape[axis] - 1, 1) + offset += shape[axis] + state.add_nedge(r, w, Memlet(data=ary, subset=subset)) + result.append(name) + + # Add final section + shape = list(desc.shape) + shape[axis] -= offset + name, _ = sdfg.add_temp_transient(shape, desc.dtype, storage=desc.storage, lifetime=desc.lifetime) + w = state.add_write(name) + subset = subsets.Range.from_array(desc) + subset[axis] = (offset, offset + shape[axis] - 1, 1) + state.add_nedge(r, w, Memlet(data=ary, subset=subset)) + result.append(name) + + # Always return a list of results, even if the size is 1 + return result + + +@oprepo.replaces('numpy.split') +def _split(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str], + axis: int = 0): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis, allow_uneven=False) + + +@oprepo.replaces('numpy.array_split') +def _array_split(visitor: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str], + axis: int = 0): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis, allow_uneven=True) + + +@oprepo.replaces('numpy.dsplit') +def _dsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + if isinstance(ary, str) and ary in sdfg.arrays: + if len(sdfg.arrays[ary].shape) < 3: + raise ValueError('Array dimensionality must be 3 or above for dsplit') + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=2, allow_uneven=False) + + +@oprepo.replaces('numpy.hsplit') +def _hsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + if isinstance(ary, str) and ary in sdfg.arrays: + # In case of a 1D array, split with axis=0 + if len(sdfg.arrays[ary].shape) <= 1: + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=0, allow_uneven=False) + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=1, allow_uneven=False) + + +@oprepo.replaces('numpy.vsplit') +def _vsplit(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, ary: str, + indices_or_sections: Union[symbolic.SymbolicType, List[symbolic.SymbolicType], str]): + return _split_core(visitor, sdfg, state, ary, indices_or_sections, axis=0, allow_uneven=False) + + +############################################################################################################ +# Fast Fourier Transform numpy package (numpy.fft) + +def _real_to_complex(real_type: dace.typeclass): + if real_type == dace.float32: + return dace.complex64 + elif real_type == dace.float64: + return dace.complex128 + else: + return real_type + + +def _fft_core(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a: str, + n: 
Optional[dace.symbolic.SymbolicType] = None, + axis=-1, + norm: StringLiteral = StringLiteral('backward'), + is_inverse: bool = False): + from dace.libraries.fft.nodes import FFT, IFFT # Avoid import loops + if axis != 0 and axis != -1: + raise NotImplementedError('Only one dimensional arrays are supported at the moment') + if not isinstance(a, str) or a not in sdfg.arrays: + raise ValueError('Input must be a valid array') + + libnode = FFT('fft') if not is_inverse else IFFT('ifft') + + desc = sdfg.arrays[a] + N = desc.shape[axis] + + # If n is not None, either pad input or slice and add a view + if n is not None: + raise NotImplementedError + + # Compute factor + if norm == 'forward': + factor = (1 / N) if not is_inverse else 1 + elif norm == 'backward': + factor = 1 if not is_inverse else (1 / N) + elif norm == 'ortho': + factor = sp.sqrt(1 / N) + else: + raise ValueError('norm argument can only be one of "forward", "backward", or "ortho".') + libnode.factor = factor + + # Compute output type from input type + if is_inverse and desc.dtype not in (dace.complex64, dace.complex128): + raise TypeError(f'Inverse FFT only accepts complex inputs, got {desc.dtype}') + dtype = _real_to_complex(desc.dtype) + + name, odesc = sdfg.add_temp_transient_like(desc, dtype) + r = state.add_read(a) + w = state.add_write(name) + state.add_edge(r, None, libnode, '_inp', Memlet.from_array(a, desc)) + state.add_edge(libnode, '_out', w, None, Memlet.from_array(name, odesc)) + + return name + + +@oprepo.replaces('numpy.fft.fft') +def _fft(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a: str, + n: Optional[dace.symbolic.SymbolicType] = None, + axis=-1, + norm: StringLiteral = StringLiteral('backward')): + return _fft_core(pv, sdfg, state, a, n, axis, norm, False) + + +@oprepo.replaces('numpy.fft.ifft') +def _ifft(pv: 'ProgramVisitor', + sdfg: SDFG, + state: SDFGState, + a, + n=None, + axis=-1, + norm: StringLiteral = StringLiteral('backward')): + return _fft_core(pv, sdfg, state, a, n, axis, norm, True) diff --git a/dace/libraries/blas/nodes/gemv.py b/dace/libraries/blas/nodes/gemv.py index baf6fb415d..52091c6864 100644 --- a/dace/libraries/blas/nodes/gemv.py +++ b/dace/libraries/blas/nodes/gemv.py @@ -730,6 +730,9 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs): dtype_a = outer_array_a.dtype.type dtype = outer_array_x.dtype.base_type veclen = outer_array_x.dtype.veclen + alpha = f'{dtype.ctype}({node.alpha})' + beta = f'{dtype.ctype}({node.beta})' + m = m or node.m n = n or node.n if m is None: @@ -765,8 +768,17 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs): func = func.lower() + 'gemv' - code = f"""cblas_{func}({layout}, {trans}, {m}, {n}, {node.alpha}, _A, {lda}, - _x, {strides_x[0]}, {node.beta}, _y, {strides_y[0]});""" + code = '' + if dtype in (dace.complex64, dace.complex128): + code = f''' + {dtype.ctype} __alpha = {alpha}; + {dtype.ctype} __beta = {beta}; + ''' + alpha = '&__alpha' + beta = '&__beta' + + code += f"""cblas_{func}({layout}, {trans}, {m}, {n}, {alpha}, _A, {lda}, + _x, {strides_x[0]}, {beta}, _y, {strides_y[0]});""" tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, diff --git a/dace/libraries/fft/__init__.py b/dace/libraries/fft/__init__.py new file mode 100644 index 0000000000..71fb014f32 --- /dev/null +++ b/dace/libraries/fft/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
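+#
+# Usage sketch (illustrative; names, sizes, and dtypes below are arbitrary): the
+# numpy.fft.fft and numpy.fft.ifft replacements build the FFT/IFFT library nodes
+# defined in this package, for example:
+#
+#     import numpy as np
+#     import dace
+#
+#     @dace.program
+#     def spectrum(x: dace.complex128[128]):
+#         return np.fft.fft(x)
+#
+# The default 'pure' expansion lowers to a naive DFT; a 'cuFFT' expansion is
+# available for arrays stored in GPU global memory.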
+from dace.library import register_library +from .nodes import * +from .environments import * + +register_library(__name__, "fft") diff --git a/dace/libraries/fft/algorithms/__init__.py b/dace/libraries/fft/algorithms/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/libraries/fft/algorithms/dft.py b/dace/libraries/fft/algorithms/dft.py new file mode 100644 index 0000000000..340dfed22d --- /dev/null +++ b/dace/libraries/fft/algorithms/dft.py @@ -0,0 +1,45 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +One-dimensional Discrete Fourier Transform (DFT) native implementations. +""" +import dace +import numpy as np +import math + + +# Native, naive version of the Discrete Fourier Transform +@dace.program +def dft(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + i = np.arange(N) + e = np.exp(-2j * np.pi * i * i[:, None] / N) + _out[:] = factor * (e @ _inp.astype(dace.complex128)) + + +@dace.program +def idft(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + i = np.arange(N) + e = np.exp(2j * np.pi * i * i[:, None] / N) + _out[:] = factor * (e @ _inp.astype(dace.complex128)) + + +# Single-map version of DFT, useful for integrating small Fourier transforms into other operations +@dace.program +def dft_explicit(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + _out[:] = 0 + for i, n in dace.map[0:N, 0:N]: + with dace.tasklet: + inp << _inp[n] + exponent = 2 * math.pi * i * n / N + b = decltype(b)(math.cos(exponent), -math.sin(exponent)) * inp * factor + b >> _out(1, lambda a, b: a + b)[i] + + +@dace.program +def idft_explicit(_inp, _out, N: dace.compiletime, factor: dace.compiletime): + _out[:] = 0 + for i, n in dace.map[0:N, 0:N]: + with dace.tasklet: + inp << _inp[n] + exponent = 2 * math.pi * i * n / N + b = decltype(b)(math.cos(exponent), math.sin(exponent)) * inp * factor + b >> _out(1, lambda a, b: a + b)[i] diff --git a/dace/libraries/fft/environments/__init__.py b/dace/libraries/fft/environments/__init__.py new file mode 100644 index 0000000000..0900214e68 --- /dev/null +++ b/dace/libraries/fft/environments/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +from .cufft import * diff --git a/dace/libraries/fft/environments/cufft.py b/dace/libraries/fft/environments/cufft.py new file mode 100644 index 0000000000..dd243d376a --- /dev/null +++ b/dace/libraries/fft/environments/cufft.py @@ -0,0 +1,21 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library + + +@dace.library.environment +class cuFFT: + + cmake_minimum_version = None + cmake_packages = ["CUDA"] + cmake_variables = {} + cmake_includes = [] + cmake_libraries = ["cufft"] + cmake_compile_flags = [] + cmake_link_flags = [] + cmake_files = [] + + headers = {'frame': ["cufft.h", "cufftXt.h"], 'cuda': ["cufft.h", "cufftXt.h"]} + state_fields = [] + init_code = "" + finalize_code = "" + dependencies = [] diff --git a/dace/libraries/fft/nodes/__init__.py b/dace/libraries/fft/nodes/__init__.py new file mode 100644 index 0000000000..dd8f132aa4 --- /dev/null +++ b/dace/libraries/fft/nodes/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. 
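+# Exposes the FFT and IFFT library nodes implemented in fft.py; their 'pure'
+# (naive DFT fallback) and 'cuFFT' expansions are registered in the same module.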
+from .fft import FFT, IFFT diff --git a/dace/libraries/fft/nodes/fft.py b/dace/libraries/fft/nodes/fft.py new file mode 100644 index 0000000000..bc85f8785b --- /dev/null +++ b/dace/libraries/fft/nodes/fft.py @@ -0,0 +1,204 @@ +# Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. +""" +Implements Forward and Inverse Fast Fourier Transform (FFT) library nodes +""" +import warnings + +from dace import data, dtypes, SDFG, SDFGState, symbolic, library, nodes, properties +from dace import transformation as xf +from dace.libraries.fft import environments as env + + +# Define the library nodes +@library.node +class FFT(nodes.LibraryNode): + implementations = {} + default_implementation = 'pure' + + factor = properties.SymbolicProperty(desc='Coefficient to multiply outputs. Used for normalization', default=1.0) + + def __init__(self, name, *args, schedule=None, **kwargs): + super().__init__(name, *args, schedule=schedule, inputs={'_inp'}, outputs={'_out'}, **kwargs) + + +@library.node +class IFFT(nodes.LibraryNode): + implementations = {} + default_implementation = 'pure' + + factor = properties.SymbolicProperty(desc='Coefficient to multiply outputs. Used for normalization', default=1.0) + + def __init__(self, name, *args, schedule=None, **kwargs): + super().__init__(name, *args, schedule=schedule, inputs={'_inp'}, outputs={'_out'}, **kwargs) + + +################################################################################################## +# Native SDFG expansions +################################################################################################## + + +@library.register_expansion(FFT, 'pure') +class DFTExpansion(xf.ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node: FFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + from dace.libraries.fft.algorithms import dft # Lazy import functions + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if len(indesc.shape) != 1: + raise NotImplementedError('Native SDFG expansion for FFT does not yet support N-dimensional inputs') + + warnings.warn('Performance Warning: No assumptions on FFT input size, falling back to DFT') + return dft.dft_explicit.to_sdfg(indesc, outdesc, N=indesc.shape[0], factor=node.factor) + + +@library.register_expansion(IFFT, 'pure') +class IDFTExpansion(xf.ExpandTransformation): + environments = [] + + @staticmethod + def expansion(node: IFFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + from dace.libraries.fft.algorithms import dft # Lazy import functions + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if len(indesc.shape) != 1: + raise NotImplementedError('Native SDFG expansion for IFFT does not yet support N-dimensional inputs') + + warnings.warn('Performance Warning: No assumptions on IFFT input size, falling back to DFT') + return dft.idft_explicit.to_sdfg(indesc, outdesc, N=indesc.shape[0], factor=node.factor) + + +################################################################################################## +# cuFFT expansions +################################################################################################## + + +@library.register_expansion(FFT, 'cuFFT') +class cuFFTFFTExpansion(xf.ExpandTransformation): + environments = [env.cuFFT] + plan_uid = 0 + + @staticmethod + def expansion(node: FFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + 
input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if str(node.factor) != '1': + raise NotImplementedError('Multiplicative post-FFT factors are not yet implemented') + return _generate_cufft_code(indesc, outdesc, parent_sdfg, False) + + +@library.register_expansion(IFFT, 'cuFFT') +class cuFFTIFFTExpansion(xf.ExpandTransformation): + environments = [env.cuFFT] + plan_uid = 0 + + @staticmethod + def expansion(node: IFFT, parent_state: SDFGState, parent_sdfg: SDFG) -> SDFG: + input, output = _get_input_and_output(parent_state, node) + indesc = parent_sdfg.arrays[input] + outdesc = parent_sdfg.arrays[output] + if str(node.factor) != '1': + raise NotImplementedError('Multiplicative post-FFT factors are not yet implemented') + return _generate_cufft_code(indesc, outdesc, parent_sdfg, True) + + +def _generate_cufft_code(indesc: data.Data, outdesc: data.Data, sdfg: SDFG, is_inverse: bool): + from dace.codegen.targets import cpp # Avoid import loops + if len(indesc.shape) not in (1, 2, 3): + raise ValueError('cuFFT only supports 1/2/3-dimensional FFT') + if indesc.storage != dtypes.StorageType.GPU_Global: + raise ValueError('cuFFT implementation requires input array to be on GPU') + if outdesc.storage != dtypes.StorageType.GPU_Global: + raise ValueError('cuFFT implementation requires output array to be on GPU') + + cufft_type = _types_to_cufft(indesc.dtype, outdesc.dtype) + init_code = '' + exit_code = '' + callsite_code = '' + + # Make a unique name for this plan + if not is_inverse: + plan_name = f'fwdplan{cuFFTFFTExpansion.plan_uid}' + cuFFTFFTExpansion.plan_uid += 1 + direction = 'CUFFT_FORWARD' + tasklet_prefix = '' + else: + plan_name = f'invplan{cuFFTIFFTExpansion.plan_uid}' + cuFFTIFFTExpansion.plan_uid += 1 + direction = 'CUFFT_INVERSE' + tasklet_prefix = 'i' + + fields = [ + f'cufftHandle {plan_name};', + ] + plan_name = f'__state->{plan_name}' + + init_code += f''' + cufftCreate(&{plan_name}); + ''' + exit_code += f''' + cufftDestroy({plan_name}); + ''' + + cdims = ', '.join([cpp.sym2cpp(s) for s in indesc.shape]) + make_plan = f''' + {{ + size_t __work_size = 0; + cufftMakePlan{len(indesc.shape)}d({plan_name}, {cdims}, {cufft_type}, /*batch=*/1, &__work_size); + }} + ''' + + # Make plan in init if not symbolic or not data-dependent, otherwise make at callsite. 
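+ # A shape is considered data-dependent if any of its free symbols is (re)assigned
+ # on an interstate edge of the SDFG; in that case the plan must be created at the
+ # call site, otherwise it can be created once in the initialization code.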
+ symbols_that_change = set(s for ise in sdfg.edges() for s in ise.data.assignments.keys()) + symbols_that_change &= set(map(str, sdfg.symbols.keys())) + + def _fsyms(x): + if symbolic.issymbolic(x): + return set(map(str, x.free_symbols)) + return set() + + if symbols_that_change and any(_fsyms(s) & symbols_that_change for s in indesc.shape): + callsite_code += make_plan + else: + init_code += make_plan + + # Execute plan + callsite_code += f''' + cufftSetStream({plan_name}, __dace_current_stream); + cufftXtExec({plan_name}, _inp, _out, {direction}); + ''' + + return nodes.Tasklet(f'cufft_{tasklet_prefix}fft', {'_inp'}, {'_out'}, + callsite_code, + language=dtypes.Language.CPP, + state_fields=fields, + code_init=init_code, + code_exit=exit_code) + + +################################################################################################## +# Helper functions +################################################################################################## + + +def _get_input_and_output(state: SDFGState, node: nodes.LibraryNode): + """ + Helper function that returns the input and output arrays of the library node + """ + in_edge = next(e for e in state.in_edges(node) if e.dst_conn) + out_edge = next(e for e in state.out_edges(node) if e.src_conn) + return in_edge.data.data, out_edge.data.data + + +def _types_to_cufft(indtype: dtypes.typeclass, outdtype: dtypes.typeclass): + typedict = { + dtypes.float32: 'R', + dtypes.float64: 'D', + dtypes.complex64: 'C', + dtypes.complex128: 'Z', + } + return f'CUFFT_{typedict[indtype]}2{typedict[outdtype]}' diff --git a/dace/libraries/standard/nodes/reduce.py b/dace/libraries/standard/nodes/reduce.py index fa231c07f2..970dfcef3a 100644 --- a/dace/libraries/standard/nodes/reduce.py +++ b/dace/libraries/standard/nodes/reduce.py @@ -103,7 +103,7 @@ def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): 'reduce_init', {'_o%d' % i: '0:%s' % symstr(d) for i, d in enumerate(outedge.data.subset.size())}, {}, '__out = %s' % node.identity, - {'__out': dace.Memlet.simple('_out', ','.join(['_o%d' % i for i in range(output_dims)]))}, + {'__out': dace.Memlet.simple('_out', ','.join(['_o%d' % i for i in osqdim]))}, external_edges=True) else: nstate = nsdfg.add_state() diff --git a/dace/libraries/standard/nodes/transpose.py b/dace/libraries/standard/nodes/transpose.py index 58c6cfc33e..e2795ef951 100644 --- a/dace/libraries/standard/nodes/transpose.py +++ b/dace/libraries/standard/nodes/transpose.py @@ -100,6 +100,12 @@ class ExpandTransposeMKL(ExpandTransformation): @staticmethod def expansion(node, state, sdfg): node.validate(sdfg, state) + + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + dtype = node.dtype if dtype == dace.float32: func = "somatcopy" @@ -141,22 +147,30 @@ class ExpandTransposeOpenBLAS(ExpandTransformation): @staticmethod def expansion(node, state, sdfg): node.validate(sdfg, state) + + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + dtype = node.dtype cast = "" if dtype == dace.float32: func = "somatcopy" alpha 
= "1.0f" + cast = '' elif dtype == dace.float64: func = "domatcopy" alpha = "1.0" + cast = '' elif dtype == dace.complex64: func = "comatcopy" - cast = "(float*)" - alpha = f"{cast}dace::blas::BlasConstants::Get().Complex64Pone()" + alpha = "dace::blas::BlasConstants::Get().Complex64Pone()" + cast = '(float*)' elif dtype == dace.complex128: func = "zomatcopy" - cast = "(double*)" - alpha = f"{cast}dace::blas::BlasConstants::Get().Complex128Pone()" + alpha = "dace::blas::BlasConstants::Get().Complex128Pone()" + cast = '(double*)' else: raise ValueError("Unsupported type for OpenBLAS omatcopy extension: " + str(dtype)) # TODO: Add stride support @@ -164,8 +178,8 @@ def expansion(node, state, sdfg): # Adaptations for BLAS API order = 'CblasRowMajor' trans = 'CblasTrans' - code = ("cblas_{f}({o}, {t}, {m}, {n}, {a}, {c}_inp, " - "{n}, {c}_out, {m});").format(f=func, o=order, t=trans, m=m, n=n, a=alpha, c=cast) + code = ("cblas_{f}({o}, {t}, {m}, {n}, {cast}{a}, {cast}_inp, " + "{n}, {cast}_out, {m});").format(f=func, o=order, t=trans, m=m, n=n, a=alpha, cast=cast) tasklet = dace.sdfg.nodes.Tasklet(node.name, node.in_connectors, node.out_connectors, @@ -184,6 +198,11 @@ def expansion(node, state, sdfg, **kwargs): node.validate(sdfg, state) dtype = node.dtype + # Fall back to native implementation if input and output types are not the same + if (sdfg.arrays[list(state.in_edges_by_connector(node, '_inp'))[0].data.data].dtype != sdfg.arrays[list( + state.out_edges_by_connector(node, '_out'))[0].data.data].dtype): + return ExpandTransposePure.make_sdfg(node, state, sdfg) + try: func, cdtype, factort = blas_helpers.cublas_type_metadata(dtype) except TypeError as ex: diff --git a/dace/memlet.py b/dace/memlet.py index f78da3a6b7..85bd0a348d 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -555,9 +555,9 @@ def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]: from dace.sdfg import nodes if isinstance(edge.dst, nodes.CodeNode) or isinstance(edge.src, nodes.CodeNode): view_edge = True - elif edge.dst_conn == 'views' and isinstance(edge.dst, nodes.AccessNode): + elif edge.dst_conn and isinstance(edge.dst, nodes.AccessNode): view_edge = True - elif edge.src_conn == 'views' and isinstance(edge.src, nodes.AccessNode): + elif edge.src_conn and isinstance(edge.src, nodes.AccessNode): view_edge = True if not view_edge: diff --git a/dace/properties.py b/dace/properties.py index 09439ce4f8..82be72f9fd 100644 --- a/dace/properties.py +++ b/dace/properties.py @@ -329,7 +329,7 @@ def initialize_properties(obj, *args, **kwargs): for name, prop in own_properties.items(): # Only assign our own properties, so we don't overwrite what's been # set by the base class - if hasattr(obj, name): + if hasattr(obj, '_' + name): raise PropertyError("Property {} already assigned in {}".format(name, type(obj).__name__)) if not prop.indirected: if prop.allow_none or prop.default is not None: diff --git a/dace/runtime/include/dace/math.h b/dace/runtime/include/dace/math.h index 0a9d153767..9cc131ee62 100644 --- a/dace/runtime/include/dace/math.h +++ b/dace/runtime/include/dace/math.h @@ -61,6 +61,45 @@ static DACE_CONSTEXPR DACE_HDFI T Mod(const T& value, const T2& modulus) { return value % modulus; } +// Fortran implements MOD for floating-point values as well +template +static DACE_CONSTEXPR DACE_HDFI T Mod_float(const T& value, const T& modulus) { + return value - static_cast(value / modulus) * modulus; +} + +// Fortran implementation of MODULO +template +static DACE_CONSTEXPR DACE_HDFI T Modulo(const T& value, const 
T& modulus) { + // Fortran implementation for integers - find R such that value = Q * modulus + R, + // where R must have the same sign as modulus (e.g., in [0, modulus) for a positive modulus). + // To achieve that, the division must be cast to floating point before taking the floor. + // Example: MODULO(-17, 3) must produce 1 and not -2: integer division yields -5, and + // floor(-5) = -5 gives the wrong result, whereas floor(-5.66...) = -6 gives 1. + // Similarly, MODULO(17, -3) must produce -1 and not 2. + // Plain integer division is only correct when value and modulus have the same sign. + return value - floor(static_cast(value) / modulus) * modulus; +} + +template +static DACE_CONSTEXPR DACE_HDFI T Modulo_float(const T& value, const T& modulus) { + return value - floor(value / modulus) * modulus; +} + +// Implement to support a match with Fortran's intrinsic EXPONENT +template::value>* = nullptr> +static DACE_CONSTEXPR DACE_HDFI int frexp(const T& a) { + int exponent = 0; + std::frexp(a, &exponent); + return exponent; +} + +// Implement to support Fortran's intrinsic NINT - round, but return an integer +template::value>* = nullptr> +static DACE_CONSTEXPR DACE_HDFI int iround(const T& a) { + return static_cast(round(a)); +} + template static DACE_CONSTEXPR DACE_HDFI T int_ceil(const T& numerator, const T2& denominator) { return (numerator + denominator - 1) / denominator; @@ -457,7 +496,7 @@ namespace dace { namespace math { - static DACE_CONSTEXPR typeless_pi pi{}; + static DACE_CONSTEXPR DACE_HostDev typeless_pi pi{}; static DACE_CONSTEXPR typeless_nan nan{}; ////////////////////////////////////////////////////// template diff --git a/dace/runtime/include/dace/nan.h b/dace/runtime/include/dace/nan.h index a8d1eb4c52..b4bac93980 100644 --- a/dace/runtime/include/dace/nan.h +++ b/dace/runtime/include/dace/nan.h @@ -13,101 +13,119 @@ namespace dace // Defines a typeless Pi struct typeless_nan { + DACE_CONSTEXPR DACE_HDFI typeless_nan() noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_nan(const typeless_nan&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_nan(typeless_nan&&) noexcept = default; + DACE_HDFI ~typeless_nan() noexcept = default; + +#ifndef DACE_XILINX + DACE_CONSTEXPR DACE_HDFI typeless_nan& operator=(const typeless_nan&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_nan& operator=(typeless_nan&&) noexcept = default; +#endif + operator int() const = delete; - operator float() const + DACE_CONSTEXPR DACE_HDFI operator float() const { return std::numeric_limits::quiet_NaN(); } - operator double() const + DACE_CONSTEXPR DACE_HDFI operator double() const { return std::numeric_limits::quiet_NaN(); } - operator long double() const + +#if !( defined(__CUDACC__) || defined(__HIPCC__) ) + // There is no long double on the GPU + DACE_CONSTEXPR DACE_HDFI operator long double() const { return std::numeric_limits::quiet_NaN(); } - typeless_nan operator+() const +#endif + DACE_CONSTEXPR DACE_HDFI typeless_nan operator+() const { return typeless_nan{}; } - typeless_nan operator-() const + DACE_CONSTEXPR DACE_HDFI typeless_nan operator-() const { return typeless_nan{}; } }; template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator*(const T&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator*(const T&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator*(const typeless_nan&, const T&) { return 
typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator*(const typeless_nan&, const T&) noexcept { return typeless_nan{}; } - inline typeless_nan - operator*(const typeless_nan&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI typeless_nan + operator*(const typeless_nan&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator+(const T&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator+(const T&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator+(const typeless_nan&, const T&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator+(const typeless_nan&, const T&) noexcept { return typeless_nan{}; } - inline typeless_nan - operator+(const typeless_nan&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI typeless_nan + operator+(const typeless_nan&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator-(const T&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator-(const T&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator-(const typeless_nan&, const T&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator-(const typeless_nan&, const T&) noexcept { return typeless_nan{}; } - inline typeless_nan - operator-(const typeless_nan&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI typeless_nan + operator-(const typeless_nan&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator/(const T&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator/(const T&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator/(const typeless_nan&, const T&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator/(const typeless_nan&, const T&) noexcept { return typeless_nan{}; } - inline typeless_nan - operator/(const typeless_nan&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI typeless_nan + operator/(const typeless_nan&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator%(const T&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator%(const T&, const typeless_nan&) noexcept { return typeless_nan{}; } template - DACE_CONSTEXPR typename std::enable_if::value, typeless_nan>::type - operator%(const typeless_nan&, const T&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI 
std::enable_if_t::value || std::is_integral::value, typeless_nan> + operator%(const typeless_nan&, const T&) noexcept { return typeless_nan{}; } - inline typeless_nan - operator%(const typeless_nan&, const typeless_nan&) { return typeless_nan{}; } + DACE_CONSTEXPR DACE_HDFI typeless_nan + operator%(const typeless_nan&, const typeless_nan&) noexcept { return typeless_nan{}; } - } -} + DACE_HDFI typeless_nan ipow(const typeless_nan&, const unsigned int&) { + return typeless_nan{}; + } // These functions allow performing operations with `typeless_nan` instances. -# define FADAPT(F) DACE_CONSTEXPR ::dace::math::typeless_nan F (::dace::math::typeless_nan) { return ::dace::math::typeless_nan{}; } -# define FADAPT2(F) template DACE_CONSTEXPR dace::math::typeless_nan F (T1&&, dace::math::typeless_nan) { return ::dace::math::typeless_nan{}; }; \ - template DACE_CONSTEXPR dace::math::typeless_nan F (dace::math::typeless_nan, T2&&) { return ::dace::math::typeless_nan{}; }; \ - DACE_CONSTEXPR ::dace::math::typeless_nan F (dace::math::typeless_nan, dace::math::typeless_nan) { return ::dace::math::typeless_nan{}; } +# define FADAPT(F) DACE_CONSTEXPR DACE_HDFI typeless_nan F (const typeless_nan&) noexcept { return typeless_nan{}; } +# define FADAPT2(F) template DACE_CONSTEXPR DACE_HDFI typeless_nan F (T1&&, dace::math::typeless_nan) noexcept { return typeless_nan{}; }; \ + template DACE_CONSTEXPR DACE_HDFI typeless_nan F (const typeless_nan&, T2&&) noexcept { return typeless_nan{}; }; \ + DACE_CONSTEXPR DACE_HDFI typeless_nan F (const typeless_nan&, const typeless_nan&) noexcept { return typeless_nan{}; } FADAPT(tanh); FADAPT(cos); FADAPT(sin); FADAPT(sqrt); FADAPT(tan); FADAPT(acos); FADAPT(asin); FADAPT(atan); FADAPT(log); FADAPT(exp); FADAPT(floor); FADAPT(ceil); FADAPT(round); FADAPT(abs); FADAPT2(max); FADAPT2(min); # undef FADAPT2 # undef FADAPT + } +} + #endif // __DACE_NAN_H diff --git a/dace/runtime/include/dace/pi.h b/dace/runtime/include/dace/pi.h index 331b8c1636..818a22f6d0 100644 --- a/dace/runtime/include/dace/pi.h +++ b/dace/runtime/include/dace/pi.h @@ -2,6 +2,8 @@ #ifndef __DACE_PI_H #define __DACE_PI_H +#include + // Classes that are used to define a typeless Pi //#define _USE_MATH_DEFINES @@ -16,233 +18,274 @@ namespace dace { ////////////////////////////////////////////////////// // Defines a typeless Pi - struct typeless_pi + + template + struct is_typeless_pi { static constexpr bool value = false; }; + #define MAKE_TYPELESS_PI(type) template<> struct is_typeless_pi { static constexpr bool value = true; } + + struct typeless_pi; + + /* Represents $m * \pi$. 
*/ + struct typeless_pi_mult { - double value() const { return M_PI; } - operator int() const - { - return int(this->value()); - } - operator float() const - { - return float(this->value()); - } - operator double() const - { - return double(this->value()); - } + int mult; + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult(int m): mult(m) {} + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult() noexcept: typeless_pi_mult(1) {}; + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult(const typeless_pi&) noexcept: typeless_pi_mult(1) {}; + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult(const typeless_pi_mult&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult(typeless_pi_mult&&) noexcept = default; + DACE_HDFI ~typeless_pi_mult() noexcept = default; + +#ifndef DACE_XILINX + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult& operator=(const typeless_pi_mult&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult& operator=(typeless_pi_mult&&) noexcept = default; +#endif + + template< + typename T, + typename = std::enable_if_t::value> + > + DACE_CONSTEXPR DACE_HDFI operator T() const noexcept + { return T(mult * M_PI); } + + DACE_CONSTEXPR DACE_HDFI operator float() const noexcept + { return float(mult * M_PI); } + + DACE_CONSTEXPR DACE_HDFI operator double() const noexcept + { return mult * M_PI; } + + DACE_CONSTEXPR DACE_HDFI operator long double() const noexcept + { return (long double)(mult * M_PI); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator+() const noexcept + { return *this; } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator-() const noexcept + { return typeless_pi_mult(-this->mult); } }; - struct typeless_pi_mult : typeless_pi + MAKE_TYPELESS_PI(typeless_pi_mult); + + /* Represents $\pi$ */ + struct typeless_pi { - int mult; typeless_pi_mult(int m = 1) : mult(m) {} - double value() const { return mult * M_PI; } - - operator int() const - { - return int(this->value()); - } - operator float() const - { - return float(this->value()); - } - operator double() const - { - return double(this->value()); - } + DACE_CONSTEXPR DACE_HDFI typeless_pi() noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi(const typeless_pi&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi(typeless_pi&&) noexcept = default; + DACE_HDFI ~typeless_pi() noexcept = default; +#ifndef DACE_XILINX + DACE_CONSTEXPR DACE_HDFI typeless_pi& operator=(const typeless_pi&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi& operator=(typeless_pi&&) noexcept = default; +#endif + + template< + typename T, + typename = std::enable_if_t::value> + > + DACE_CONSTEXPR DACE_HDFI operator T() const noexcept + { return T(M_PI); } + + DACE_CONSTEXPR DACE_HDFI operator float() const noexcept + { return float(M_PI); } + + DACE_CONSTEXPR DACE_HDFI operator double() const noexcept + { return M_PI; } + + DACE_CONSTEXPR DACE_HDFI operator long double() const noexcept + { return (long double)(M_PI); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi operator+() const noexcept + { return *this; } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator-() const noexcept + { return typeless_pi_mult(-1); } }; - struct typeless_pi_exp : typeless_pi_mult + MAKE_TYPELESS_PI(typeless_pi); + + /* Represents $m * \pi^{e}$ */ + struct typeless_pi_exp { - int mult, exp; typeless_pi_exp(int m = 1, int e = 1) : mult(m), exp(e) {} - double value() const { return mult * std::pow(M_PI, exp); } - operator int() const - { - return int(this->value()); - } - operator float() const - { - return float(this->value()); - } - operator double() const - 
{ - return double(this->value()); - } + int mult, exp; + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp(int m, int e): mult(m), exp(e) {} + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp() noexcept: typeless_pi_exp(1, 1) {}; + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp(const typeless_pi_exp&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp(typeless_pi_exp&&) noexcept = default; + DACE_HDFI ~typeless_pi_exp() noexcept = default; + +#ifndef DACE_XILINX + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp& operator=(const typeless_pi_exp&) noexcept = default; + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp& operator=(typeless_pi_exp&&) noexcept = default; +#endif + + template< + typename T, + typename = std::enable_if_t::value> + > + DACE_CONSTEXPR DACE_HDFI operator T() const noexcept + { return T(mult * std::pow(static_cast(M_PI), exp)); } + + + /* We have to do the selection this way, because it seems as nvidia does + * not provide `powl` and `powf` in the std namespace */ + DACE_CONSTEXPR DACE_HDFI operator float() const + { using std::pow; return mult * pow(static_cast(M_PI), exp); } + + DACE_CONSTEXPR DACE_HDFI operator double() const + { using std::pow; return mult * std::pow(static_cast(M_PI), exp); } + +#if !( defined(__CUDACC__) || defined(__HIPCC__) ) + //There is no long double on the GPU + DACE_CONSTEXPR DACE_HDFI operator long double() const + { using std::pow; return mult * std::pow(static_cast(M_PI), exp); } +#endif + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator+() const + { return *this; } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator-() const + { return typeless_pi_exp(-this->mult, this->exp); } }; - inline typeless_pi_mult operator*(const typeless_pi&, const int& num) - { - return typeless_pi_mult(num); - } - inline typeless_pi_mult operator*(const typeless_pi_mult& p, const int& num) - { - return typeless_pi_mult(p.mult * num); - } - inline typeless_pi_exp operator*(const typeless_pi_exp& p, const int& num) - { - return typeless_pi_exp(p.mult * num, p.exp); - } - inline typeless_pi_mult operator*(const int& num, const typeless_pi&) - { - return typeless_pi_mult(num); - } - inline typeless_pi_mult operator*(const int& num, const typeless_pi_mult& p) - { - return typeless_pi_mult(num * p.mult); - } - inline typeless_pi_exp operator*(const int& num, const typeless_pi_exp& p) - { - return typeless_pi_exp(num * p.mult, p.exp); - } - template - T operator+(const typeless_pi& p, const T& num) - { - return T(p.value()) + num; - } - template - T operator-(const typeless_pi& p, const T& num) - { - return T(p.value()) - num; - } + MAKE_TYPELESS_PI(typeless_pi_exp); - template - T operator*(const typeless_pi& p, const T& num) - { - return T(p.value()) * num; - } - template - T operator/(const typeless_pi& p, const T& num) - { - return T(p.value()) / num; - } - template - T operator+(const T& num, const typeless_pi& p) - { - return num + T(p.value()); - } - template - T operator-(const T& num, const typeless_pi& p) - { - return num - T(p.value()); - } - template - T operator*(const T& num, const typeless_pi& p) - { - return num * T(p.value()); - } - template - T operator/(const T& num, const typeless_pi& p) - { - return num / T(p.value()); - } - template - T operator+(const typeless_pi_mult& p, const T& num) - { - return T(p.value()) + num; - } - template - T operator-(const typeless_pi_mult& p, const T& num) - { - return T(p.value()) - num; - } - template - T operator*(const typeless_pi_mult& p, const T& num) - { - return T(p.value()) * num; - } - template - T operator/(const 
typeless_pi_mult& p, const T& num) - { - return T(p.value()) / num; - } - template - T operator+(const T& num, const typeless_pi_mult& p) - { - return num + T(p.value()); - } - template - T operator-(const T& num, const typeless_pi_mult& p) - { - return num - T(p.value()); - } - template - T operator*(const T& num, const typeless_pi_mult& p) - { - return num * T(p.value()); - } - template - T operator/(const T& num, const typeless_pi_mult& p) - { - return num / T(p.value()); - } - template - T operator+(const typeless_pi_exp& p, const T& num) - { - return T(p.value()) + num; - } - template - T operator-(const typeless_pi_exp& p, const T& num) - { - return T(p.value()) - num; - } + DACE_CONSTEXPR DACE_HDFI int operator/(const typeless_pi&, const typeless_pi&) noexcept + { return 1; } - template - T operator*(const typeless_pi_exp& p, const T& num) - { - return T(p.value()) * num; - } - template - T operator/(const typeless_pi_exp& p, const T& num) - { - return T(p.value()) / num; - } - template - T operator+(const T& num, const typeless_pi_exp& p) - { - return num + T(p.value()); - } - template - T operator-(const T& num, const typeless_pi_exp& p) - { - return num - T(p.value()); - } - template - T operator*(const T& num, const typeless_pi_exp& p) - { - return num * T(p.value()); - } - template - T operator/(const T& num, const typeless_pi_exp& p) - { - return num / T(p.value()); - } - inline typeless_pi_mult operator-(const typeless_pi&) - { - return typeless_pi_mult(-1); - } - template - typeless_pi_mult operator+(const typeless_pi&, const typeless_pi&) - { - return typeless_pi_mult(2); - } - template - typeless_pi_mult operator+(const typeless_pi_mult& p1, const typeless_pi_mult& p2) - { - return typeless_pi_mult(p1.mult + p2.mult); - } - template - typeless_pi_exp operator*(const typeless_pi_mult& p1, const typeless_pi_mult& p2) - { - return typeless_pi_exp(p1.mult * p2.mult, 2); + DACE_CONSTEXPR DACE_HDFI int operator-(const typeless_pi&, const typeless_pi&) noexcept + { return 0; } + + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator*(const typeless_pi&, const int& num) noexcept + { return typeless_pi_mult(num); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator*(const int& num, const typeless_pi&) noexcept + { return typeless_pi_mult(num); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator*(const typeless_pi_mult& p, const int& num) noexcept + { return typeless_pi_mult(p.mult * num); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator*(const int& num, const typeless_pi_mult& p) noexcept + { return typeless_pi_mult(p.mult * num); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator+(const typeless_pi&, const typeless_pi&) noexcept + { return typeless_pi_mult(2); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator+(const typeless_pi&, const typeless_pi_mult& pi) noexcept + { return typeless_pi_mult(pi.mult + 1); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator+(const typeless_pi_mult& pi, const typeless_pi&) noexcept + { return typeless_pi_mult(pi.mult + 1); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator+(const typeless_pi_mult& pl, const typeless_pi_mult& pr) noexcept + { return typeless_pi_mult(pl.mult + pr.mult); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_mult operator-(const typeless_pi_mult& pl, const typeless_pi_mult& pr) noexcept + { return typeless_pi_mult(pl.mult - pr.mult); } + + DACE_CONSTEXPR DACE_HDFI int operator/(const typeless_pi_mult& pl, const typeless_pi&) noexcept + { return pl.mult; } + + DACE_CONSTEXPR DACE_HDFI 
double operator/(const typeless_pi& pl, const typeless_pi_mult& pr) noexcept + { return 1.0 / pr.mult; } + + + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi&, const typeless_pi&) noexcept + { return typeless_pi_exp(1, 2); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi_mult& pl, const typeless_pi_mult& pr) noexcept + { return typeless_pi_exp(pl.mult * pr.mult, 2); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi_mult& pl, const typeless_pi&) noexcept + { return typeless_pi_exp(pl.mult, 2); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi& pl, const typeless_pi_mult& pr) noexcept + { return typeless_pi_exp(pr.mult, 2); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi_exp& pl, const typeless_pi_mult& pr) noexcept + { return typeless_pi_exp(pl.mult * pr.mult, pl.exp + 1); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi_mult& pl, const typeless_pi_exp& pr) noexcept + { return pr * pl; } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi_exp& pl, const typeless_pi_exp& pr) noexcept + { return typeless_pi_exp(pl.mult * pr.mult, pr.exp + pl.exp); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const typeless_pi_exp& pl, const int& num) noexcept + { return typeless_pi_exp(pl.mult * num, pl.exp); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator*(const int& num, const typeless_pi_exp& pr) noexcept + { return typeless_pi_exp(pr.mult * num, pr.exp); } + + DACE_CONSTEXPR DACE_HDFI typeless_pi_exp operator/(const typeless_pi_exp& pl, const typeless_pi&) noexcept + { return typeless_pi_exp(pl.mult, pl.exp - 1); } + + + // The code generator guarantees us that `b > 0`. + DACE_HDFI typeless_pi_exp ipow(const typeless_pi_mult& pi, const unsigned int& b) { + return typeless_pi_exp(pow(pi.mult, b), b); } - template - typeless_pi_exp operator*(const typeless_pi&, const typeless_pi&) - { - return typeless_pi_exp(1, 2); + DACE_HDFI typeless_pi_exp ipow(const typeless_pi& pi, const unsigned int& b) { + return typeless_pi_exp(1, b); } - template - typeless_pi_exp operator*(const typeless_pi_exp& p1, const typeless_pi_exp& p2) - { - return typeless_pi_exp(p1.mult * p2.mult, p1.exp + p2.exp); + DACE_HDFI typeless_pi_exp ipow(const typeless_pi_exp& pi, const unsigned int& b) { + return typeless_pi_exp(pow(pi.mult, b), pi.exp * b); } + +# define DEF_PI_OPS(op) \ + template::value && (!is_typeless_pi::value)> > \ + DACE_CONSTEXPR DACE_HDFI T operator op (const T& lhs, const PI& pi) noexcept \ + { return lhs op (static_cast(pi)); } \ + template::value && (!is_typeless_pi::value)> > \ + DACE_CONSTEXPR DACE_HDFI T operator op (const PI& pi, const T& rhs) noexcept \ + { return (static_cast(pi)) op rhs; } + + DEF_PI_OPS(+); + DEF_PI_OPS(-); + DEF_PI_OPS(/); + DEF_PI_OPS(*); + + DACE_CONSTEXPR DACE_HDFI int sin(const typeless_pi&) noexcept + { return 0; } + + DACE_CONSTEXPR DACE_HDFI int sin(const typeless_pi_mult& pi) noexcept + { return 0; } + + DACE_HDFI double sin(const typeless_pi_exp& pi) noexcept + { return std::sin(static_cast(pi)); } + + DACE_CONSTEXPR DACE_HDFI int cos(const typeless_pi&) noexcept + { return 1; } + + DACE_CONSTEXPR DACE_HDFI int cos(const typeless_pi_mult& pi) noexcept + { return (pi.mult % 2 == 0) ? 
1 : (-1); } + + DACE_HDFI double cos(const typeless_pi_exp& pi) noexcept + { return std::cos(static_cast(pi)); } + + +# define DEF_PI_TRIGO(F) \ + DACE_HDFI double F (const typeless_pi& pi) noexcept \ + { return std:: F( static_cast(pi) ); } \ + DACE_HDFI double F (const typeless_pi_mult& pi) noexcept \ + { return std:: F( static_cast(pi) ); } \ + DACE_HDFI double F (const typeless_pi_exp& pi) noexcept \ + { return std:: F( static_cast(pi) ); } + + DEF_PI_TRIGO(asin); + DEF_PI_TRIGO(acos); + DEF_PI_TRIGO(tan); + DEF_PI_TRIGO(atan); + DEF_PI_TRIGO(exp); + DEF_PI_TRIGO(log); + + +# undef DEF_PI_TRIGO +# undef DEF_PI_OPS +# undef MAKE_TYPELESS_PI } } diff --git a/dace/runtime/include/dace/reduction.h b/dace/runtime/include/dace/reduction.h index 927bf449de..81017610ae 100644 --- a/dace/runtime/include/dace/reduction.h +++ b/dace/runtime/include/dace/reduction.h @@ -205,15 +205,27 @@ namespace dace { #if defined(DACE_USE_GPU_ATOMICS) template <> - struct _wcr_fixed { + struct _wcr_fixed { - static DACE_HDFI long long reduce_atomic(long long *ptr, const long long& value) { + static DACE_HDFI int64_t reduce_atomic(int64_t *ptr, const int64_t& value) { return _wcr_fixed::reduce_atomic(( unsigned long long *)ptr, static_cast(value)); } - DACE_HDFI long long operator()(const long long &a, const long long &b) const { return a + b; } + DACE_HDFI int64_t operator()(const int64_t &a, const int64_t &b) const { return a + b; } + }; + + template <> + struct _wcr_fixed { + + static DACE_HDFI uint64_t reduce_atomic(uint64_t *ptr, const uint64_t& value) { + return _wcr_fixed::reduce_atomic(( + unsigned long long *)ptr, + static_cast(value)); + } + + DACE_HDFI uint64_t operator()(const uint64_t &a, const uint64_t &b) const { return a + b; } }; #endif diff --git a/dace/runtime/include/dace/stream.h b/dace/runtime/include/dace/stream.h index 255e16ec2b..1f8134fae6 100644 --- a/dace/runtime/include/dace/stream.h +++ b/dace/runtime/include/dace/stream.h @@ -338,7 +338,7 @@ namespace dace { template struct Consume { - template