From 8e07a956d2972b1b0b44a9d4ef1a1546e5b5f27a Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Mon, 18 Nov 2024 18:02:53 +0100 Subject: [PATCH 1/5] Fix Linux build issues Signed-off-by: Andreas Krebbel --- zdnn/aiu_quantized_matmul.c | 256 +++++------------------------------- zdnn/zdnn_private.h | 23 ++++ 2 files changed, 55 insertions(+), 224 deletions(-) diff --git a/zdnn/aiu_quantized_matmul.c b/zdnn/aiu_quantized_matmul.c index d15bd17..9036fa7 100644 --- a/zdnn/aiu_quantized_matmul.c +++ b/zdnn/aiu_quantized_matmul.c @@ -271,13 +271,8 @@ static void compute_bias(const zdnn_ztensor *input_c, const float scale, vec_char *in_c_vec = (vec_char *)((void *)((uintptr_t)input_c->buffer + in_c_offset)); -#if defined(__MVS__) vec_int16 *qc_tilde_vec = (vec_int16 *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#else - vec_short *qc_tilde_vec = - (vec_short *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#endif remaining_fields = input_c->transformed_desc->dim1 - e1x; fields_to_convert = MIN(remaining_fields, AIU_2BYTE_CELLS_PER_STICK); @@ -288,21 +283,12 @@ static void compute_bias(const zdnn_ztensor *input_c, const float scale, // Load high end of in_c_vec (first 8 elements) into temp_int16 vec_short temp_int16 = vec_unpackh(*in_c_vec); -#if defined(__MVS__) vec_fp32 temp_float_hi = vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset); vec_fp32 temp_float_lo = vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset); - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32( - vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset), - vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset), - 0); -#endif + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); nbr_fields_converted += 8; @@ -312,21 +298,12 @@ static void compute_bias(const zdnn_ztensor *input_c, const float scale, // Load low end of in_c_vec (final 8 elements) into temp_int16 temp_int16 = vec_unpackl(*in_c_vec); -#if defined(__MVS__) temp_float_hi = vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset); temp_float_lo = vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset); - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32( - vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset), - vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset), - 0); -#endif + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); // Push in_c_vec to the next 16 int8 elements in_c_vec++; @@ -341,35 +318,20 @@ static void compute_bias(const zdnn_ztensor *input_c, const float scale, // is AIU_PAGESIZE_IN_BYTES number of bytes away since dim3 and dim2 == 1 out_offset += AIU_PAGESIZE_IN_BYTES; -#if defined(__MVS__) qc_tilde_vec = (vec_int16 *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#else - qc_tilde_vec = - (vec_short *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#endif // Final AIU_2BYTE_CELLS_PER_STICK of AIU_1BYTE_CELLS_PER_STICK while (nbr_fields_converted < remaining_fields) { // Load high end of in_c_vec (first 8 elements) into temp_int16 vec_short temp_int16 = vec_unpackh(*in_c_vec); -#if defined(__MVS__) vec_fp32 temp_float_hi = vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset); vec_fp32 temp_float_lo = vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset); - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32( - vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset), - vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset), - 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); nbr_fields_converted += 8; if (nbr_fields_converted >= remaining_fields) @@ -378,21 +340,12 @@ static void compute_bias(const zdnn_ztensor *input_c, const float scale, // Load low end of in_c_vec (final 8 elements) into temp_int16 temp_int16 = vec_unpackl(*in_c_vec); -#if defined(__MVS__) temp_float_hi = vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset); temp_float_lo = vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset); - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32( - vec_madd(vec_float(vec_unpackh(temp_int16)), vec_scale, vec_offset), - vec_madd(vec_float(vec_unpackl(temp_int16)), vec_scale, vec_offset), - 0); -#endif + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); // Push in_c_vec to the next 16 int8 elements. in_c_vec++; @@ -541,13 +494,8 @@ static void compute_folded_bias(const zdnn_ztensor *input_b, vec_char *in_c_vec = (vec_char *)((void *)((uintptr_t)input_c->buffer + in_c_offset)); -#if defined(__MVS__) vec_int16 *qc_tilde_vec = (vec_int16 *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#else - vec_short *qc_tilde_vec = - (vec_short *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#endif remaining_fields = input_c->transformed_desc->dim1 - e1x; fields_to_convert = MIN(remaining_fields, AIU_2BYTE_CELLS_PER_STICK); @@ -606,13 +554,7 @@ static void compute_folded_bias(const zdnn_ztensor *input_b, temp_float_hi -= (vec_float(summ_vec_hi) * vec_MZa); temp_float_lo -= (vec_float(summ_vec_lo) * vec_MZa); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; @@ -672,13 +614,7 @@ static void compute_folded_bias(const zdnn_ztensor *input_b, temp_float_hi -= (vec_float(summ_vec_hi) * vec_MZa); temp_float_lo -= (vec_float(summ_vec_lo) * vec_MZa); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; @@ -694,13 +630,8 @@ static void compute_folded_bias(const zdnn_ztensor *input_b, in_b_offset = in_b_w_offset + in_b_bytes_all_w; out_offset += AIU_PAGESIZE_IN_BYTES; -#if defined(__MVS__) qc_tilde_vec = (vec_int16 *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#else - qc_tilde_vec = - (vec_short *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#endif // Final AIU_2BYTE_CELLS_PER_STICK of AIU_1BYTE_CELLS_PER_STICK while (nbr_fields_converted < remaining_fields) { @@ -755,14 +686,7 @@ static void compute_folded_bias(const zdnn_ztensor *input_b, temp_float_hi -= (vec_float(summ_vec_hi) * vec_MZa); temp_float_lo -= (vec_float(summ_vec_lo) * vec_MZa); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; nbr_fields_converted += 8; @@ -821,14 +745,7 @@ static void compute_folded_bias(const zdnn_ztensor *input_b, temp_float_hi -= (vec_float(summ_vec_hi) * vec_MZa); temp_float_lo -= (vec_float(summ_vec_lo) * vec_MZa); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; // Push in_c_vec to the next 16 int8 elements. @@ -965,13 +882,8 @@ static void compute_comparison_bias(const zdnn_ztensor *input_b, vec_char *in_c_vec = (vec_char *)((void *)((uintptr_t)input_c->buffer + in_c_offset)); -#if defined(__MVS__) vec_int16 *qc_tilde_vec = (vec_int16 *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#else - vec_short *qc_tilde_vec = - (vec_short *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#endif remaining_fields = input_c->transformed_desc->dim1 - e1x; fields_to_convert = MIN(remaining_fields, AIU_2BYTE_CELLS_PER_STICK); @@ -1030,14 +942,7 @@ static void compute_comparison_bias(const zdnn_ztensor *input_b, temp_float_hi += (vec_float(summ_vec_hi) * vec_Za); temp_float_lo += (vec_float(summ_vec_lo) * vec_Za); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; nbr_fields_converted += 8; @@ -1100,14 +1005,7 @@ static void compute_comparison_bias(const zdnn_ztensor *input_b, temp_float_hi += (vec_float(summ_vec_hi) * vec_Za); temp_float_lo += (vec_float(summ_vec_lo) * vec_Za); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; // Push in_c_vec to the next 16 int8 elements. @@ -1122,13 +1020,8 @@ static void compute_comparison_bias(const zdnn_ztensor *input_b, in_b_offset = in_b_w_offset + in_b_bytes_all_w; out_offset += AIU_PAGESIZE_IN_BYTES; -#if defined(__MVS__) qc_tilde_vec = (vec_int16 *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#else - qc_tilde_vec = - (vec_short *)((void *)((uintptr_t)qc_tilde->buffer + out_offset)); -#endif // Final AIU_2BYTE_CELLS_PER_STICK of AIU_1BYTE_CELLS_PER_STICK while (nbr_fields_converted < remaining_fields) { @@ -1183,14 +1076,7 @@ static void compute_comparison_bias(const zdnn_ztensor *input_b, temp_float_hi += (vec_float(summ_vec_hi) * vec_Za); temp_float_lo += (vec_float(summ_vec_lo) * vec_Za); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; nbr_fields_converted += 8; @@ -1249,14 +1135,7 @@ static void compute_comparison_bias(const zdnn_ztensor *input_b, temp_float_hi += (vec_float(summ_vec_hi) * vec_Za); temp_float_lo += (vec_float(summ_vec_lo) * vec_Za); -#if defined(__MVS__) - *qc_tilde_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_float_hi), - *(vec_float32 *)((void *)&temp_float_lo)); -#else - *qc_tilde_vec++ = vec_round_from_fp32(temp_float_hi, temp_float_lo, 0); -#endif - + *qc_tilde_vec++ = VEC_ROUND_FROM_FP32(temp_float_hi, temp_float_lo); in_b_offset += 16; // Push in_c_vec to the next 16 int8 elements. @@ -1500,39 +1379,22 @@ static void apply_clipping(const int8_t clip_min, const int8_t clip_max, for (uint32_t e1x = 0; e1x < output->transformed_desc->dim1; e1x += AIU_2BYTE_CELLS_PER_STICK) { -#if defined(__MVS__) + vec_int16 *output_vec = (vec_int16 *)((void *)((uintptr_t)output->buffer + out_offset)); -#else - vec_short *output_vec = - (vec_short *)((void *)((uintptr_t)output->buffer + out_offset)); -#endif fields_to_convert = MIN(output->transformed_desc->dim1 - e1x, AIU_2BYTE_CELLS_PER_STICK); nbr_fields_converted = 0; while (nbr_fields_converted < fields_to_convert) { -#if defined(__MVS__) vec_fp32 temp_vec_hi, temp_vec_lo; - aiu_vec_lengthen_to_fp32(*output_vec, - (vec_float32 *)((void *)&temp_vec_hi), - (vec_float32 *)((void *)&temp_vec_lo)); -#else - vec_fp32 temp_vec_hi = vec_extend_to_fp32_hi(*output_vec, 0); - vec_fp32 temp_vec_lo = vec_extend_to_fp32_lo(*output_vec, 0); -#endif + VEC_LENGTHEN_TO_FP32(*output_vec, temp_vec_hi, temp_vec_lo); (*clip_round_hi_func)(&temp_vec_hi, &vec_clip_min, &vec_clip_max); (*clip_round_lo_func)(&temp_vec_lo, &vec_clip_min, &vec_clip_max); (*deq_func)(&temp_vec_hi, &temp_vec_lo, vec_scale, vec_offset); -#if defined(__MVS__) - *output_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_vec_hi), - *(vec_float32 *)((void *)&temp_vec_lo)); -#else - *output_vec++ = vec_round_from_fp32(temp_vec_hi, temp_vec_lo, 0); -#endif + *output_vec++ = VEC_ROUND_FROM_FP32(temp_vec_hi, temp_vec_lo); nbr_fields_converted += 8; } @@ -1774,28 +1636,17 @@ static void apply_correction_term(const zdnn_ztensor *input_a, for (uint32_t e1x = 0; e1x < output->transformed_desc->dim1; e1x += AIU_2BYTE_CELLS_PER_STICK) { -#if defined(__MVS__) + vec_int16 *output_vec = (vec_int16 *)((void *)((uintptr_t)output->buffer + out_offset)); -#else - vec_short *output_vec = - (vec_short *)((void *)((uintptr_t)output->buffer + out_offset)); -#endif fields_to_convert = MIN(output->transformed_desc->dim1 - e1x, AIU_2BYTE_CELLS_PER_STICK); nbr_fields_converted = 0; while (nbr_fields_converted < fields_to_convert) { -#if defined(__MVS__) vec_fp32 temp_vec_hi, temp_vec_lo; - aiu_vec_lengthen_to_fp32(*output_vec, - (vec_float32 *)((void *)&temp_vec_hi), - (vec_float32 *)((void *)&temp_vec_lo)); -#else - vec_fp32 temp_vec_hi = vec_extend_to_fp32_hi(*output_vec, 0); - vec_fp32 temp_vec_lo = vec_extend_to_fp32_lo(*output_vec, 0); -#endif + VEC_LENGTHEN_TO_FP32(*output_vec, temp_vec_hi, temp_vec_lo); temp_vec_hi -= (*term_b_vec + term_a_vec); (*clip_round_hi_func)(&temp_vec_hi, &vec_clip_min, &vec_clip_max); term_b_vec++; @@ -1803,14 +1654,7 @@ static void apply_correction_term(const zdnn_ztensor *input_a, (*clip_round_lo_func)(&temp_vec_lo, &vec_clip_min, &vec_clip_max); term_b_vec++; (*deq_func)(&temp_vec_hi, &temp_vec_lo, vec_scale, vec_offset); - -#if defined(__MVS__) - *output_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_vec_hi), - *(vec_float32 *)((void *)&temp_vec_lo)); -#else - *output_vec++ = vec_round_from_fp32(temp_vec_hi, temp_vec_lo, 0); -#endif + *output_vec++ = VEC_ROUND_FROM_FP32(temp_vec_hi, temp_vec_lo); nbr_fields_converted += 8; } @@ -1912,13 +1756,9 @@ static void apply_correction_term_on_the_fly( for (uint32_t e1x = 0; e1x < input_a->transformed_desc->dim1; e1x += AIU_2BYTE_CELLS_PER_STICK) { -#if defined(__MVS__) + vec_int16 *in_a_vec = (vec_int16 *)((void *)((uintptr_t)input_a->buffer + in_a_offset)); -#else - vec_short *in_a_vec = - (vec_short *)((void *)((uintptr_t)input_a->buffer + in_a_offset)); -#endif remaining_fields = MIN(input_a->transformed_desc->dim1 - e1x, AIU_2BYTE_CELLS_PER_STICK); @@ -1926,42 +1766,28 @@ static void apply_correction_term_on_the_fly( nbr_fields_converted = 0; while (nbr_fields_converted < fields_to_convert) { -#if defined(__MVS__) - vec_float32 temp_float_hi, temp_float_lo; - aiu_vec_lengthen_to_fp32(*in_a_vec, &temp_float_hi, &temp_float_lo); + vec_fp32 temp_float_hi, temp_float_lo; + VEC_LENGTHEN_TO_FP32(*in_a_vec, temp_float_hi, temp_float_lo); - summ_vec_a_hi += *(vec_fp32 *)((void *)&temp_float_hi); - summ_vec_a_lo += *(vec_fp32 *)((void *)&temp_float_lo); -#else - summ_vec_a_hi += vec_extend_to_fp32_hi(*in_a_vec, 0); - summ_vec_a_lo += vec_extend_to_fp32_lo(*in_a_vec, 0); -#endif + summ_vec_a_hi += temp_float_hi; + summ_vec_a_lo += temp_float_lo; in_a_vec++; nbr_fields_converted += 8; } if (nbr_fields_converted < remaining_fields) { -#if defined(__MVS__) + // Load remaining fields_to_convert into temp_vec vec_int16 temp_vec = vec_load_len((uint16_t *)in_a_vec, (remaining_fields - nbr_fields_converted) * 2 - 1); - vec_float32 temp_float_hi, temp_float_lo; - aiu_vec_lengthen_to_fp32(temp_vec, &temp_float_hi, &temp_float_lo); - - summ_vec_a_hi += *(vec_fp32 *)((void *)&temp_float_hi); - summ_vec_a_lo += *(vec_fp32 *)((void *)&temp_float_lo); -#else - // Load remaining fields into temp_vec - vec_short temp_vec = - vec_load_len((short *)in_a_vec, - (remaining_fields - nbr_fields_converted) * 2 - 1); + vec_fp32 temp_float_hi, temp_float_lo; + VEC_LENGTHEN_TO_FP32(temp_vec, temp_float_hi, temp_float_lo); - summ_vec_a_hi += vec_extend_to_fp32_hi(temp_vec, 0); - summ_vec_a_lo += vec_extend_to_fp32_lo(temp_vec, 0); -#endif + summ_vec_a_hi += temp_float_hi; + summ_vec_a_lo += temp_float_lo; } in_a_offset += in_a_bytes_all_w; @@ -2065,28 +1891,17 @@ static void apply_correction_term_on_the_fly( for (uint32_t e1x = 0; e1x < output->transformed_desc->dim1; e1x += AIU_2BYTE_CELLS_PER_STICK) { -#if defined(__MVS__) vec_int16 *output_vec = (vec_int16 *)((void *)((uintptr_t)output->buffer + out_offset)); -#else - vec_short *output_vec = - (vec_short *)((void *)((uintptr_t)output->buffer + out_offset)); -#endif fields_to_convert = MIN(output->transformed_desc->dim1 - e1x, AIU_2BYTE_CELLS_PER_STICK); nbr_fields_converted = 0; while (nbr_fields_converted < fields_to_convert) { -#if defined(__MVS__) + vec_fp32 temp_vec_hi, temp_vec_lo; - aiu_vec_lengthen_to_fp32(*output_vec, - (vec_float32 *)((void *)&temp_vec_hi), - (vec_float32 *)((void *)&temp_vec_lo)); -#else - vec_fp32 temp_vec_hi = vec_extend_to_fp32_hi(*output_vec, 0); - vec_fp32 temp_vec_lo = vec_extend_to_fp32_lo(*output_vec, 0); -#endif + VEC_LENGTHEN_TO_FP32(*output_vec, temp_vec_hi, temp_vec_lo); temp_vec_hi -= (*term_b_vec + term_a_vec); (*clip_round_hi_func)(&temp_vec_hi, &vec_clip_min, &vec_clip_max); term_b_vec++; @@ -2094,14 +1909,7 @@ static void apply_correction_term_on_the_fly( (*clip_round_lo_func)(&temp_vec_lo, &vec_clip_min, &vec_clip_max); term_b_vec++; (*deq_func)(&temp_vec_hi, &temp_vec_lo, vec_scale, vec_offset); - -#if defined(__MVS__) - *output_vec++ = - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&temp_vec_hi), - *(vec_float32 *)((void *)&temp_vec_lo)); -#else - *output_vec++ = vec_round_from_fp32(temp_vec_hi, temp_vec_lo, 0); -#endif + *output_vec++ = VEC_ROUND_FROM_FP32(temp_vec_hi, temp_vec_lo); nbr_fields_converted += 8; } @@ -2585,4 +2393,4 @@ aiu_quantized_matmul(uint16_t op_parm_block_version, #else return ZDNN_STATUS_OK; #endif -} \ No newline at end of file +} diff --git a/zdnn/zdnn_private.h b/zdnn/zdnn_private.h index 3e16caa..806e8cf 100644 --- a/zdnn/zdnn_private.h +++ b/zdnn/zdnn_private.h @@ -589,6 +589,29 @@ void aiu_vec_lengthen_to_fp32(vec_int16 a, vec_float32 *out1, vec_int16 aiu_vec_convert_from_fp16(vec_int16 a); vec_int16 aiu_vec_convert_to_fp16(vec_int16 a); +#ifdef __MVS__ +#define VEC_ROUND_FROM_FP32(FP_HI, FP_LO) \ + aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&(FP_HI)), \ + *(vec_float32 *)((void *)&(FP_LO))); +#define VEC_LENGTHEN_TO_FP32(IN, OUT_HI, OUT_LO) \ + aiu_vec_lengthen_to_fp32((IN), (vec_float32 *)((void *)&(OUT_HI)), \ + (vec_float32 *)((void *)&(OUT_LO))); +#else +#define VEC_ROUND_FROM_FP32(FP_HI, FP_LO) \ + (vec_int16) vec_round_from_fp32((FP_HI), (FP_LO), 0); +/* These compiler intrinsics changed between GCC 13 and 14 from using + vector short to vector unsigned short. */ +#if __GNUC__ <= 13 +#define VEC_LENGTHEN_TO_FP32(IN, OUT_HI, OUT_LO) \ + (OUT_HI) = vec_extend_to_fp32_hi((vector short)(IN), 0); \ + (OUT_LO) = vec_extend_to_fp32_lo((vector short)(IN), 0); +#else +#define VEC_LENGTHEN_TO_FP32(IN, OUT_HI, OUT_LO) \ + (OUT_HI) = vec_extend_to_fp32_hi((IN), 0); \ + (OUT_LO) = vec_extend_to_fp32_lo((IN), 0); +#endif +#endif + // ----------------------------------------------------------------------------- // NNPA-MATMUL-OP function-specific-parameters and their bitfields // ----------------------------------------------------------------------------- From dfc5b87d5719f986742d7c771ecfddb56719ec67 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 20 Nov 2024 11:13:45 +0100 Subject: [PATCH 2/5] Add missing prototypes to testsupport.h Signed-off-by: Andreas Krebbel --- tests/testsupport.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/testsupport.h b/tests/testsupport.h index 6407579..f7c0da2 100644 --- a/tests/testsupport.h +++ b/tests/testsupport.h @@ -39,6 +39,8 @@ extern float ZERO_ARRAY[1]; // likely due to something's wrong with the testcase itself #define GENERAL_TESTCASE_FAILURE 0xDEADBEEF +void nhwc_2_nchw(void *nhwc_ptr, uint32_t n, uint32_t h, uint32_t w, uint32_t c, + int element_size, void *nchw_ptr); size_t *alloc_offsets(zdnn_ztensor *ztensor); size_t *alloc_rnn_offsets(const zdnn_ztensor *ztensor); size_t *alloc_rnn_output_offsets(const zdnn_ztensor *ztensor); @@ -228,6 +230,10 @@ extern char error_message[ERROR_MESSAGE_STR_LENGTH]; extern zdnn_data_types test_datatype; +void UnityDefaultTestRunWithAllPreDataType(UnityTestFunction Func, + const char *FuncName, + const int FuncLineNum); + void UnityDefaultTestRunWithDLFloat16PreDataType(UnityTestFunction Func, const char *FuncName, const int FuncLineNum); @@ -240,6 +246,10 @@ void UnityDefaultTestRunWithIndexPreDataType(UnityTestFunction Func, const char *FuncName, const int FuncLineNum); +void UnityDefaultTestRunWithAllTfrmdDataType(UnityTestFunction Func, + const char *FuncName, + const int FuncLineNum); + void UnityDefaultTestRunWithDLFloat16TfrmdDataType(UnityTestFunction Func, const char *FuncName, const int FuncLineNum); From e976de7d1079d3bb4aee4dc5841f0867e16f8612 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 20 Nov 2024 14:29:05 +0100 Subject: [PATCH 3/5] Get rid of type-aliasing warning Signed-off-by: Andreas Krebbel --- zdnn/convert_hw.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/zdnn/convert_hw.c b/zdnn/convert_hw.c index d3ab807..ddc7be2 100644 --- a/zdnn/convert_hw.c +++ b/zdnn/convert_hw.c @@ -377,8 +377,13 @@ float cnvt_1_fp16_to_fp32(uint16_t a) { } // convert 1 FP32 element to BFLOAT -// cppcheck-suppress invalidPointerCast -uint16_t cnvt_1_fp32_to_bfloat(float a) { return *(uint16_t *)(&a); } +uint16_t cnvt_1_fp32_to_bfloat(float a) { + union { + float in; + uint16_t out; + } u = {.in = a}; + return u.out; +} // convert 1 FP32 element to FP16 uint16_t cnvt_1_fp32_to_fp16(float a) { From 235f0d76842c97c04ea007394f2de265aa8ce767 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 20 Nov 2024 14:15:10 +0100 Subject: [PATCH 4/5] Go down to z14 to support older distros Signed-off-by: Andreas Krebbel --- config.zdnn | 4 ++-- zdnn/zdnn_private.h | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/config.zdnn b/config.zdnn index 524c2dc..865a16b 100644 --- a/config.zdnn +++ b/config.zdnn @@ -29,13 +29,13 @@ case "${target}" in CFLAGS_INIT="-O3 -mzvector -Wall -std=gnu99 -fstack-protector-all ${CFLAGS_INIT:-}" CFLAGS_QUOTE_INIT="-Wall" # Not needed on Linux. Just repeat an option to prevent it from being empty. CFLAGS_OPT_EXPENSIVE="-funroll-loops" - CFLAGS="-O3 -march=z16 -mzvector -Wall -std=gnu99 -fstack-protector-all ${CFLAGS_OPT_EXPENSIVE} ${CFLAGS:-}" + CFLAGS="-O3 -march=z14 -mzvector -Wall -std=gnu99 -fstack-protector-all ${CFLAGS_OPT_EXPENSIVE} ${CFLAGS:-}" CFLAGS_QUOTE="-Wall" CFLAGS_DEBUG="-O0 -g3 ${CFLAGS_DEBUG:-}" CFLAGS_SHARED="-fPIC ${CFLAGS_SHARED:-}" CFLAGS_ASM="-Wa,-adhln -fno-asynchronous-unwind-tables ${CFLAGS_ASM:-}" CFLAGS_NOSEARCH="" - CXXFLAGS="-O3 -march=z16 -Wall ${CXXFLAGS:-}" + CXXFLAGS="-O3 -march=z14 -Wall ${CXXFLAGS:-}" CPP_SYMCHECK_FLAGS="-E -o zdnn.i" SODIR="${SODIR:-lib}" LIBNAME="${LIBNAME:-libzdnn}" diff --git a/zdnn/zdnn_private.h b/zdnn/zdnn_private.h index 806e8cf..dd9690c 100644 --- a/zdnn/zdnn_private.h +++ b/zdnn/zdnn_private.h @@ -589,13 +589,12 @@ void aiu_vec_lengthen_to_fp32(vec_int16 a, vec_float32 *out1, vec_int16 aiu_vec_convert_from_fp16(vec_int16 a); vec_int16 aiu_vec_convert_to_fp16(vec_int16 a); -#ifdef __MVS__ +#if defined(__MVS__) || (defined(__ARCH__) && __ARCH__ < 14) #define VEC_ROUND_FROM_FP32(FP_HI, FP_LO) \ - aiu_vec_round_from_fp32(*(vec_float32 *)((void *)&(FP_HI)), \ - *(vec_float32 *)((void *)&(FP_LO))); + aiu_vec_round_from_fp32((vec_float32)(FP_HI), (vec_float32)(FP_LO)); #define VEC_LENGTHEN_TO_FP32(IN, OUT_HI, OUT_LO) \ - aiu_vec_lengthen_to_fp32((IN), (vec_float32 *)((void *)&(OUT_HI)), \ - (vec_float32 *)((void *)&(OUT_LO))); + aiu_vec_lengthen_to_fp32((IN), (vec_float32 *)&(OUT_HI), \ + (vec_float32 *)&(OUT_LO)); #else #define VEC_ROUND_FROM_FP32(FP_HI, FP_LO) \ (vec_int16) vec_round_from_fp32((FP_HI), (FP_LO), 0); @@ -1542,6 +1541,32 @@ void dumpdata_ztensor(const zdnn_ztensor *ztensor, dump_mode mode, #define PADDED(x) \ ((uint32_t)CEIL(x, AIU_2BYTE_CELLS_PER_STICK) * AIU_2BYTE_CELLS_PER_STICK) +#if !defined(vec_float) || __ARCH__ < 13 +#undef vec_float +#define vec_float(X) \ + ({ \ + __vector float out; \ + /* vcefb\t%[out],%[in],0,0 */ \ + __asm__(".insn vrr,0xe700000020c3,%[out],%[in],0,2,0,0" \ + : [out] "=v"(out) \ + : [in] "v"(X)); \ + out; \ + }) +#endif + +#if defined(__GNUC__) && __GNUC__ <= 7 +#undef vec_round +#define vec_round(X) \ + ({ \ + __vector float out; \ + /* vfisb %[out],%[in],4,4 */ \ + __asm__(".insn vrr,0xe700000020c7,%[out],%[in],0,2,4,4" \ + : [out] "=v"(out) \ + : [in] "v"(X)); \ + out; \ + }) +#endif + // ----------------------------------------------------------------------------- // Private global variables // ----------------------------------------------------------------------------- From 33d3b764fc9d464bbf55a05499067ecbce0d9fc6 Mon Sep 17 00:00:00 2001 From: Nicholas Marion Date: Fri, 22 Nov 2024 10:41:35 -0500 Subject: [PATCH 5/5] Resolve CPPCheck errors. Signed-off-by: Nicholas Marion --- zdnn/aiu_quantized_matmul.c | 2 -- zdnn/convert_hw.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/zdnn/aiu_quantized_matmul.c b/zdnn/aiu_quantized_matmul.c index 9036fa7..18a910e 100644 --- a/zdnn/aiu_quantized_matmul.c +++ b/zdnn/aiu_quantized_matmul.c @@ -1463,7 +1463,6 @@ static void apply_correction_term(const zdnn_ztensor *input_a, vec_fp32 vec_MZa = vec_splats(M * input_a->offset); float term_a[input_a->transformed_desc->dim2]; - // cppcheck-suppress unassignedVariable float term_b[input_b->transformed_desc->dim1 + 7]; vec_fp32 vec_clip_min = vec_splats((float)clip_min); @@ -1723,7 +1722,6 @@ static void apply_correction_term_on_the_fly( vec_fp32 vec_MZa = vec_splats(M * input_a->offset); float term_a[input_a->transformed_desc->dim2]; - // cppcheck-suppress unassignedVariable float term_b[input_b->transformed_desc->dim1 + 7]; vec_fp32 vec_clip_min = vec_splats((float)clip_min); diff --git a/zdnn/convert_hw.c b/zdnn/convert_hw.c index ddc7be2..a793dd9 100644 --- a/zdnn/convert_hw.c +++ b/zdnn/convert_hw.c @@ -379,7 +379,7 @@ float cnvt_1_fp16_to_fp32(uint16_t a) { // convert 1 FP32 element to BFLOAT uint16_t cnvt_1_fp32_to_bfloat(float a) { union { - float in; + float in; // cppcheck-suppress unusedStructMember uint16_t out; } u = {.in = a}; return u.out;