Merge tag '2.2.1'

ROCm · Feb 26, 2020 · 4fa6197 · 4fa6197
2 parents 9fd3e57 + 9218683
commit 4fa6197
Show file tree

Hide file tree

Showing 38 changed files with 1,252 additions and 418 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -67,7 +67,7 @@ if(NOT WIN32 AND NOT APPLE)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
 endif()
 
-rocm_setup_version(VERSION 2.2.0)
+rocm_setup_version(VERSION 2.2.1)
 
 list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake )
 include(TargetFlags)
@@ -207,8 +207,15 @@ endif()
 # Online assembler
 find_program(MIOPEN_AMDGCN_ASSEMBLER
     NAMES clang
-    PATHS ${MIOPEN_AMDGCN_ASSEMBLER_PATH} /opt/rocm
-    PATH_SUFFIXES /opencl/bin/x86_64
+    PATHS 
+        ${MIOPEN_AMDGCN_ASSEMBLER_PATH}
+        /opt/rocm
+        /opt/rocm/hcc
+        ${CMAKE_INSTALL_PREFIX}
+        ${CMAKE_INSTALL_PREFIX}/hcc
+    PATH_SUFFIXES 
+        /opencl/bin/x86_64
+        /bin
     NO_DEFAULT_PATH
 )
 message(STATUS "AMDGCN assembler: ${MIOPEN_AMDGCN_ASSEMBLER}")

diff --git a/README.md b/README.md
@@ -99,6 +99,8 @@ An example cmake step can be:
 CXX=/opt/rocm/hcc/bin/hcc cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/hcc;/opt/rocm/hip" ..
 ```
 
+Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, do not use the `~` shorthand for the user home directory.
+
 ### Setting Up Locations
 
 By default the install location is set to '/opt/rocm', this can be set by using `CMAKE_INSTALL_PREFIX`:

diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake
@@ -35,6 +35,7 @@ find_path(OPENCL_INCLUDE_DIRS
     /usr/local/cuda/include
     /opt/cuda/include
     /opt/rocm/opencl/include
+    ${CMAKE_INSTALL_PREFIX}/opencl/include
     DOC "OpenCL header file path"
     )
 mark_as_advanced( OPENCL_INCLUDE_DIRS )
@@ -53,6 +54,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
         /usr/local/cuda/lib
         /opt/cuda/lib
         /opt/rocm/opencl/lib
+        ${CMAKE_INSTALL_PREFIX}/opencl/lib
         )
 else( )
     find_library( OPENCL_LIBRARIES

diff --git a/doc/src/DebugAndLogging.md b/doc/src/DebugAndLogging.md
@@ -155,10 +155,14 @@ More information on logging with RocBlas can be found [here](https://github.com/
 
 ### Code Object (CO) version selection (EXPERIMENTAL)
 
-currently, ROCm fully supports Code Object version 2 (Co v2). The support for version 3 (CO v3) is being gradually introduced. These variables allows for experimenting and triaging problems related to CO version:
-* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Overrides CO version auto-detection implemented in the library. `0` or unset - disable overriding (the default), `1` - enforces CO v2, `2` - behave as if both CO v2 and v3 are supported, `2` - enforces CO v3.
-* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER` - This variable affects only Solutions available in both v2 and v3 code objects, and is intended to use only when ROCm supports both CO v2 and CO v3. By default, the older format is used (CO v2). When this variable is _enabled_, the behavior is reversed.
-* `MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3` - Enforces CO v3 for OpenCL kernels.
+Different ROCm versions use Code Object files of different versions (or, in other words, formats). The library uses suitable version automatically. The following variables allow for experimenting and triaging possible problems related to CO version:
+* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Affects kernels written in GCN assembly language.
+  * `0` or unset - Automatically detect the required CO version and assemble to that version. This is the default.
+  * `1` - Do not auto-detect Code Object version, always assemble v2 Code Objects.
+  * `2` - Behave as if both CO v2 and v3 are supported (see `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER`).
+  * `3` - Always assemble v3 Code Objects.
+* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER` - This variable affects only assembly kernels, and only when ROCm supports both CO v2 and CO v3 (like ROCm 2.10). By default, the newer format is used (CO v3). When this variable is _enabled_, the behavior is reversed.
+* `MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3` - Enforces CO v3 for OpenCL kernels. Works with HIP backend only (`cmake ... -DMIOPEN_BACKEND=HIP...`).
 
 ### `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_WORKSPACE_MAX`
 

diff --git a/doc/src/perfdatabase.md b/doc/src/perfdatabase.md
@@ -78,4 +78,4 @@ This variable allows for limiting the scope of `MIOPEN_FIND_ENFORCE`, so that on
 
 ### Updating MIOpen and the User Db
 
-It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from polution the configurations shipped with the newer system database. The user can find the file with the suffix `*.updb.txt` in the user perf db path.
+It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from poluting the configurations shipped with the newer system database. The user perf db is named `miopen.udb` and is located at the user perf db path.
diff --git a/doc/src/releasenotes.md b/doc/src/releasenotes.md
@@ -3,6 +3,22 @@
 
 
 
+### 01/24/2020 [ 2.2.1 ]
+
+- This release contains bug fixes, documentation updates, and further code object version 3 support
+
+
+Changes:
+
+- Added support for multiple ROCm installations
+- Added additional support for code object v3
+- Fixed issue with incorrect LRN calculation [#127](https://github.com/ROCmSoftwarePlatform/MIOpen/issues/127)
+- Fixed incorrect performance database documentation
+- Fixed issue with incorrect workspace calculation in group convolutions
+- Fixed issue with unsupported hardware instructions used with inline assembly
+
+
+
 ### 12/19/2019 [ 2.2.0 ]
 
 - This release contains bug fixes, performance improvements, and expanded applicability for specific convolutional algorithms.
@@ -29,7 +45,7 @@ Changes:
 
 
 
-### 09/25/2019 [2.1.0]
+### 09/25/2019 [ 2.1.0 ]
 
 - This release contains new layers, bug fixes, and a new convolution algorithm.
 

diff --git a/driver/mloNormHost.hpp b/driver/mloNormHost.hpp
@@ -71,6 +71,12 @@ int mloLRNForwardRunHost(bool do_scale,
 {
 
     int ret = 0;
+    if(local_area < 1 + pad)
+    {
+        std::cout << "ERROR: Lrn kernel size is insufficient." << std::endl;
+        return -1;
+    }
+
     if(norm_region == MLO_LRN_ACROSS_CHANNELS)
     {
         for(int b = 0; b < n_batchs; b++)
@@ -94,7 +100,7 @@ int mloLRNForwardRunHost(bool do_scale,
                         ++head;
                     }
                     // until we reach size, nothing needs to be subtracted
-                    while(head < local_area && head - pad >= 0 && head < n_inputs)
+                    while(head < local_area)
                     {
                         bot_val = (head < n_inputs)
                                       ? static_cast<_Tcheck>(
@@ -103,21 +109,21 @@ int mloLRNForwardRunHost(bool do_scale,
                                       : static_cast<_Tcheck>(0);
                         accum_scale += bot_val * bot_val;
                         _Tcheck scale = K + accum_scale * alphaoverarea;
-                        if((head - pad) >= 0 && do_scale)
+                        if((head - pad) >= 0 && (head - pad) < n_outputs && do_scale)
                         {
                             scale_v_ptr[b * scale_v_batch_stride +
                                         (head - pad) * scale_v_channel_stride + j * scale_v_stride +
                                         i] = scale;
                         }
                         bot_val =
-                            ((head - pad) >= 0)
+                            ((head - pad) >= 0 && (head - pad) < n_inputs)
                                 ? static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride +
                                                                (head - pad) * bot_channel_stride +
                                                                j * bot_stride + i])
                                 : static_cast<_Tcheck>(0);
                         _Tcheck s     = pow(scale, -beta);
                         _Tcheck c_val = bot_val * s;
-                        if((head - pad) >= 0)
+                        if((head - pad) >= 0 && (head - pad) < n_outputs)
                         {
                             top_v_ptr[b * top_v_batch_stride + (head - pad) * top_v_channel_stride +
                                       j * top_v_stride + i] = c_val;
@@ -209,8 +215,8 @@ int mloLRNForwardRunHost(bool do_scale,
                     {
                         // c-emulator
                         _Tcheck scale     = static_cast<_Tcheck>(0);
-                        int hstart        = j - pad;
-                        int wstart        = i - pad;
+                        int hstart        = j - (local_area - 1 - pad);
+                        int wstart        = i - (local_area - 1 - pad);
                         int hend          = std::min(hstart + local_area, bot_height + pad);
                         int wend          = std::min(wstart + local_area, bot_width + pad);
                         int adj_area_size = (hend - hstart) * (wend - wstart);
@@ -297,6 +303,12 @@ int mloLRNBackwardRunHost(int norm_region,
 
     int ret               = 0;
     _Tcheck negative_beta = -beta;
+    int pre_pad           = local_area - 1 - pad;
+    if(pre_pad < 0)
+    {
+        std::cout << "ERROR: Lrn kernel size is insufficient." << std::endl;
+        return -1;
+    }
 
     if(norm_region == MLO_LRN_ACROSS_CHANNELS)
     {
@@ -316,7 +328,7 @@ int mloLRNBackwardRunHost(int norm_region,
                     _Tcheck accum_ratio = static_cast<_Tcheck>(0);
 
                     // accumulate values
-                    while(head < pad)
+                    while(head < pre_pad)
                     {
                         if(head < n_inputs)
                         {
@@ -357,24 +369,24 @@ int mloLRNBackwardRunHost(int norm_region,
                             accum_ratio += adder;
                         }
 
-                        if(head - pad >= 0 && head - pad < n_inputs)
+                        if(head - pre_pad >= 0 && head - pre_pad < n_inputs)
                         {
                             bot_df_v_ptr[b * bot_df_v_batch_stride +
-                                         (head - pad) * bot_df_v_channel_stride +
+                                         (head - pre_pad) * bot_df_v_channel_stride +
                                          j * bot_df_v_stride + i] =
                                 static_cast<_Tcheck>(
                                     top_df_ptr[b * top_df_batch_stride +
-                                               (head - pad) * top_df_channel_stride +
+                                               (head - pre_pad) * top_df_channel_stride +
                                                j * top_df_stride + i]) *
                                     pow(static_cast<_Tcheck>(
                                             scale_ptr[b * scale_batch_stride +
-                                                      (head - pad) * scale_channel_stride +
+                                                      (head - pre_pad) * scale_channel_stride +
                                                       j * scale_stride + i]),
                                         negative_beta) -
-                                ratio_dta_bwd *
-                                    static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride +
-                                                                 (head - pad) * bot_channel_stride +
-                                                                 j * bot_stride + i]) *
+                                ratio_dta_bwd * static_cast<_Tcheck>(
+                                                    bot_ptr[b * bot_batch_stride +
+                                                            (head - pre_pad) * bot_channel_stride +
+                                                            j * bot_stride + i]) *
                                     accum_ratio;
                         }
                         ++head;
@@ -415,31 +427,31 @@ int mloLRNBackwardRunHost(int norm_region,
 
                             accum_ratio -= subs;
                         }
-                        if(head - pad >= 0)
+                        if(head - pre_pad >= 0)
                         {
                             bot_df_v_ptr[b * bot_df_v_batch_stride +
-                                         (head - pad) * bot_df_v_channel_stride +
+                                         (head - pre_pad) * bot_df_v_channel_stride +
                                          j * bot_df_v_stride + i] =
                                 static_cast<_Tcheck>(
                                     top_df_ptr[b * top_df_batch_stride +
-                                               (head - pad) * top_df_channel_stride +
+                                               (head - pre_pad) * top_df_channel_stride +
                                                j * top_df_stride + i]) *
                                     pow(static_cast<_Tcheck>(
                                             scale_ptr[b * scale_batch_stride +
-                                                      (head - pad) * scale_channel_stride +
+                                                      (head - pre_pad) * scale_channel_stride +
                                                       j * scale_stride + i]),
                                         negative_beta) -
-                                ratio_dta_bwd *
-                                    static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride +
-                                                                 (head - pad) * bot_channel_stride +
-                                                                 j * bot_stride + i]) *
+                                ratio_dta_bwd * static_cast<_Tcheck>(
+                                                    bot_ptr[b * bot_batch_stride +
+                                                            (head - pre_pad) * bot_channel_stride +
+                                                            j * bot_stride + i]) *
                                     accum_ratio;
                         }
 
                         ++head;
                     }
                     // subtract only
-                    while(head < n_inputs + pad)
+                    while(head < n_inputs + pre_pad)
                     {
                         if(head - local_area >= 0 && head - local_area < n_inputs)
                         {
@@ -459,24 +471,24 @@ int mloLRNBackwardRunHost(int norm_region,
 
                             accum_ratio -= subs;
                         }
-                        if(head - pad >= 0 && head - pad < n_inputs)
+                        if(head - pre_pad >= 0 && head - pre_pad < n_inputs)
                         {
                             bot_df_v_ptr[b * bot_df_v_batch_stride +
-                                         (head - pad) * bot_df_v_channel_stride +
+                                         (head - pre_pad) * bot_df_v_channel_stride +
                                          j * bot_df_v_stride + i] =
                                 static_cast<_Tcheck>(
                                     top_df_ptr[b * top_df_batch_stride +
-                                               (head - pad) * top_df_channel_stride +
+                                               (head - pre_pad) * top_df_channel_stride +
                                                j * top_df_stride + i]) *
                                     pow(static_cast<_Tcheck>(
                                             scale_ptr[b * scale_batch_stride +
-                                                      (head - pad) * scale_channel_stride +
+                                                      (head - pre_pad) * scale_channel_stride +
                                                       j * scale_stride + i]),
                                         negative_beta) -
-                                ratio_dta_bwd *
-                                    static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride +
-                                                                 (head - pad) * bot_channel_stride +
-                                                                 j * bot_stride + i]) *
+                                ratio_dta_bwd * static_cast<_Tcheck>(
+                                                    bot_ptr[b * bot_batch_stride +
+                                                            (head - pre_pad) * bot_channel_stride +
+                                                            j * bot_stride + i]) *
                                     accum_ratio;
                         }
 
@@ -502,8 +514,8 @@ int mloLRNBackwardRunHost(int norm_region,
 
                         int hstart        = j - pad;
                         int wstart        = i - pad;
-                        int hend          = std::min(hstart + local_area, top_height + pad);
-                        int wend          = std::min(wstart + local_area, top_width + pad);
+                        int hend          = std::min(hstart + local_area, top_height + pre_pad);
+                        int wend          = std::min(wstart + local_area, top_width + pre_pad);
                         int adj_area_size = (hend - hstart) * (wend - wstart);
                         hstart            = std::max(hstart, 0);
                         wstart            = std::max(wstart, 0);

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-RadeonOpenCompute/rocm-cmake@3f43e2d493f24abbab4dc189a9ab12cc3ad33baf --build
+RadeonOpenCompute/rocm-cmake@1abe21258481d4cf92f5bab0ef5956636c52f735 --build
 RadeonOpenCompute/clang-ocl@363b4f7ad8eb7b5104b9d5a3b8bf93f294d3ffae
 ROCmSoftwarePlatform/MIOpenGEMM@0eb1257cfaef83ea155aabd67af4437c0028db48
 ROCmSoftwarePlatform/rocBLAS@cbf0dd9a26b4406300d98dbc85088568c1532faf

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -46,7 +46,8 @@ message( STATUS "MIOpen_VERSION= ${MIOpen_VERSION}" )
 message( STATUS "CMAKE_BUILD_TYPE= ${CMAKE_BUILD_TYPE}" )
 
 # This is incremented when the ABI to the library changes
-set( MIOpen_SOVERSION 1 )
+set( MIOpen_SOVERSION 1.0 )
+
 
 function(add_kernels KERNEL_FILES)
     set(INIT_KERNELS_LIST)
@@ -238,7 +239,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/conv_3x3_wheel_alpha_v9_0_15.inc
         kernels/rocm_version.inc
         kernels/inst_wrappers.inc
-        kernels/common.inc
+        kernels/conv_common.inc
+        kernels/utilities.inc
         kernels/xform_data_filter.inc
         kernels/xform_kd_cov2.inc
         kernels/xform_metadata.inc
@@ -448,7 +450,7 @@ add_library( MIOpen
     ${MIOpen_Source}
     )
 
-set_target_properties(MIOpen PROPERTIES SOVERSION 1)
+rocm_set_soversion(MIOpen ${MIOpen_SOVERSION})
 
 clang_tidy_check(MIOpen)
 

diff --git a/src/include/miopen/mlo_internal.hpp b/src/include/miopen/mlo_internal.hpp
@@ -132,13 +132,11 @@ class rocm_meta_version
     int val = Unknown;
 
     public:
-    static constexpr int
-        Unknown     = 0, // Unset env.vars read as 0.
-        AMDHSA_COv2 = 1, // 1.0, see https://llvm.org/docs/AMDGPUUsage.html#code-object-metadata
-        AMDHSA_COv2_COv3 = 2, // E.g. ROCm 2.6 supports both.
-        AMDHSA_COv3      = 3,
-        Default =
-            AMDHSA_COv2; // Assumption for HIP backend. To be updated together with ROCm release.
+    static constexpr int Unknown = 0, // Unset env.vars read as 0.
+        AMDHSA_COv2              = 1, // V2 metadata, https://llvm.org/docs/AMDGPUUsage.html
+        AMDHSA_COv2_COv3         = 2, // E.g. ROCm 2.10 supports both.
+        AMDHSA_COv3              = 3, // V3 metadata, https://llvm.org/docs/AMDGPUUsage.html
+        Default                  = AMDHSA_COv2; // Used when auto-detection fails.
 
     private:
     static constexpr int End = 4, Begin = Unknown;
Original file line number	Diff line number	Diff line change
Expand Up		@@ -78,4 +78,4 @@ This variable allows for limiting the scope of `MIOPEN_FIND_ENFORCE`, so that on

		### Updating MIOpen and the User Db

		It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from polution the configurations shipped with the newer system database. The user can find the file with the suffix `*.updb.txt` in the user perf db path.
		It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from poluting the configurations shipped with the newer system database. The user perf db is named `miopen.udb` and is located at the user perf db path.