Skip to content

Commit

Permalink
Merge branch 'main-dev' into mvmc
Browse files Browse the repository at this point in the history
  • Loading branch information
xrq-phys committed Sep 22, 2021
2 parents 1fc23d2 + 4622f6a commit 0ecc66b
Show file tree
Hide file tree
Showing 27 changed files with 2,029 additions and 1,670 deletions.
115 changes: 115 additions & 0 deletions .github/workflows/auto-release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Builds BLIS for several target configurations whenever a release tag is
# pushed, and attaches the resulting tarballs to an automatically created
# GitHub pre-release.
name: Automatic release

on:
  push:
    tags:
      - 'v*'
      - 'sv*'

jobs:
  # Creates the release once. Its upload URL is exported as a job output so
  # that every matrix build below can attach its own asset to it.
  create-rel:
    name: BLIS release ${{ github.ref }}
    runs-on: ubuntu-latest
    outputs:
      upload-url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Create GH release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ github.ref }}
          release_name: Automatic release.
          body: |
            Automatically generated by GitHub actions.
          draft: false
          prerelease: true

  # Configures, builds, (where possible) tests, packages and uploads BLIS for
  # each entry of the target matrix.
  build-test-upload:
    name: BLIS ${{ matrix.target.config }} - ${{ github.event_name }}
    needs: create-rel
    runs-on: ubuntu-latest
    strategy:
      # One failing target must not cancel the builds of the other targets.
      fail-fast: false
      matrix:
        # Per-target fields:
        #   config   - BLIS configuration name passed to ./configure
        #   cc       - compiler executable
        #   cflags   - extra compiler flags (plain string; the configure step
        #              adds the shell quoting)
        #   packages - apt packages providing the toolchain
        target:
          - config: 'intel64'
            cc: 'gcc'
            cflags: ''
            packages: 'gcc binutils'
          - config: 'amd64'
            cc: 'gcc'
            cflags: ''
            packages: 'gcc binutils'
          # The AArch64 configs below are meant to be merged into a single
          # entry once autodetection has been added.
          - config: 'cortexa57'
            cc: 'aarch64-linux-gnu-gcc'
            cflags: '-mno-outline-atomics'
            packages: 'gcc-aarch64-linux-gnu libc6-dev-arm64-cross'
          - config: 'armsve'
            cc: 'aarch64-linux-gnu-gcc-10'
            cflags: '-mno-outline-atomics'
            packages: 'gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross'
          - config: 'a64fx'
            cc: 'aarch64-linux-gnu-gcc-10'
            cflags: '-mno-outline-atomics -DCACHE_SECTOR_SIZE_READONLY'
            packages: 'gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross'

    steps:
      - uses: actions/checkout@v2

      - name: Install build dependencies
        # `packages` is intentionally left unquoted: it is a space-separated
        # list that must be split into individual package names.
        run: |
          sudo apt-get update
          sudo apt-get install -y ${{ matrix.target.packages }}
          ${{ matrix.target.cc }} --version

      - name: Configure project
        # CFLAGS is quoted here so that multi-flag (and empty) values reach
        # configure as a single argument.
        run: |
          mkdir dest
          ./configure -t none -p ./dest \
            CC="${{ matrix.target.cc }}" \
            CFLAGS="${{ matrix.target.cflags }}" \
            ${{ matrix.target.config }}

      - name: Build
        # Bound the parallelism to the number of available cores; a bare
        # `make -j` spawns an unlimited number of jobs.
        run: |
          make -j"$(nproc)"

      - name: Quick check
        # Only the x86_64 configurations produce binaries that can execute on
        # the (x86_64) hosted runner; the AArch64 targets are cross-compiled.
        if: matrix.target.config == 'intel64' || matrix.target.config == 'amd64'
        run: |
          make -j"$(nproc)" checkblis-fast

      - name: Install
        run: |
          make install

      - name: Packup
        # Archive the installed headers and libraries, with paths relative to
        # the install prefix.
        run: |
          tar -C dest -zcvf \
            "libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz" \
            include lib

      - name: Upload GH release asset
        id: upload-release-asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.create-rel.outputs.upload-url }}
          asset_path: ./libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz
          asset_name: libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz
          # The archive is gzip-compressed, not a bare tarball.
          asset_content_type: application/gzip
69 changes: 10 additions & 59 deletions config/a64fx/bli_cntx_init_a64fx.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);

Expand All @@ -67,11 +69,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx )

// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 16, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, 10, 10 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 192, 96 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, 1536, 1536 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, 11520, 11760 );

// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
Expand All @@ -86,57 +88,6 @@ void bli_cntx_init_a64fx( cntx_t* cntx )
cntx
);

#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );

// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);

// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);

// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );

// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif

// Set A64FX cache sector sizes for each PE/CMG
// SC Fugaku might disable users' setting cache sizes.
#if !defined(CACHE_SECTOR_SIZE_READONLY)
Expand Down
2 changes: 2 additions & 0 deletions config/armsve/bli_armsve_config_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,6 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \

EXPANDMAC_BLKSZ_ARMSVE( s, 4 )
EXPANDMAC_BLKSZ_ARMSVE( d, 8 )
EXPANDMAC_BLKSZ_ARMSVE( c, 8 )
EXPANDMAC_BLKSZ_ARMSVE( z, 16 )

2 changes: 2 additions & 0 deletions config/armsve/bli_armsve_config_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,6 @@ dim_t bli_vl_bits_armsve(void);

void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_c_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_z_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);

72 changes: 14 additions & 58 deletions config/armsve/bli_cntx_init_armsve.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,23 @@ void bli_cntx_init_armsve( cntx_t* cntx )
// Block size.
dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s;
dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d;
dim_t m_r_c, n_r_c, k_c_c, m_c_c, n_c_c;
dim_t m_r_z, n_r_z, k_c_z, m_c_z, n_c_z;
bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s);
bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d);
bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c);
bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z);

// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
4,
// These are vector-length agnostic kernels. Yet knowing mr is required at runtime.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);

Expand All @@ -84,11 +90,11 @@ void bli_cntx_init_armsve( cntx_t* cntx )

// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, m_r_c, m_r_z );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, n_r_c, n_r_z );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, m_c_c, m_c_z );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, k_c_c, k_c_z );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, n_c_c, n_c_z );

// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
Expand All @@ -103,55 +109,5 @@ void bli_cntx_init_armsve( cntx_t* cntx )
cntx
);

#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 );

// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);

// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);

// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 );

// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
}

6 changes: 3 additions & 3 deletions docs/Performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -550,9 +550,9 @@ The `runthese.m` file will contain example invocations of the function.
* Operating system: RHEL 8.3
* Page size: 256 bytes
* Compiler: gcc 10.1.0
* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021
* Results gathered: 2 April 2021; BLIS and SSL2 updated on 21 Sept 2021
* Implementations tested:
* BLIS 61584de (post-0.8.1)
* BLIS b05279d (post-0.8.1)
* configured with:
* `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded)
* `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded)
Expand All @@ -574,7 +574,7 @@ The `runthese.m` file will contain example invocations of the function.
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48`
* **NOTE**: While this version of ARMPL does provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, and `trsm` (with the exception of `dtrsm`), these implementations yield very low performance, and their long run times led us to skip collecting these data altogether.
* Fujitsu SSL2 (Fujitsu toolchain 1.2.31)
* Fujitsu SSL2 (Fujitsu toolchain 1.2.33)
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1 NPARALLEL=1`
* Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12 NPARALLEL=12`
* Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48 NPARALLEL=48`
Expand Down
Binary file modified docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf
Binary file not shown.
Binary file modified docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf
Binary file not shown.
Binary file modified docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/graphs/large/l3_perf_a64fx_nt1.pdf
Binary file not shown.
Binary file modified docs/graphs/large/l3_perf_a64fx_nt1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 0ecc66b

Please sign in to comment.