enable arm64 decode #2

emmansun · 2023-11-03T06:27:10Z

Decoding in STD mode should be OK (reference: https://github.com/aklomp/base64/blob/master/lib/arch/neon64/dec_loop.c); URL mode still needs to be checked.

STD mode:

// The input consists of five valid character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
//   #  From       To        LUT  Characters
//   1  [0..42]    [255]      #1  invalid input
//   2  [43]       [62]       #1  +
//   3  [44..46]   [255]      #1  invalid input
//   4  [47]       [63]       #1  /
//   5  [48..57]   [52..61]   #1  0..9
//   6  [58..63]   [255]      #1  invalid input
//   7  [64]       [255]      #2  invalid input
//   8  [65..90]   [0..25]    #2  A..Z
//   9  [91..96]   [255]      #2  invalid input
//  10  [97..122]  [26..51]   #2  a..z
//  11  [123..126] [255]      #2  invalid input
// (12) Everything else => invalid input

// The first LUT will use the VTBL instruction (out of range indices are set to
// 0 in destination).
//
// STD alphabet, 64 entries, indexed directly by input byte values 0..63:
//   - entry 43 ('+') holds 62,
//   - entry 47 ('/') holds 63,
//   - entries 48..57 ('0'..'9') hold 52..61,
//   - every other entry is 255, the marker for invalid input.
static const uint8_t dec_lut1[] = {
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
};

// The second LUT will use the VTBX instruction (out of range indices will be
// unchanged in destination). Input [64..126] will be mapped to index [1..63]
// in this LUT. Index 0 means that value comes from LUT #1.
//
// STD alphabet, 64 entries (input byte = index + 63 for indices 1..63):
//   - entry 0 is a placeholder 0 (see above),
//   - entry 1 (input 64) is 255,
//   - entries 2..27 ('A'..'Z') hold 0..25,
//   - entries 28..33 (inputs 91..96) are 255,
//   - entries 34..59 ('a'..'z') hold 26..51,
//   - entries 60..63 (inputs 123..126) are 255.
// 255 is the marker for invalid input.
static const uint8_t dec_lut2[] = {
	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
};

URL mode:

// The input consists of five valid character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
//   #  From       To        LUT  Characters
//   1  [0..44]    [255]      #1  invalid input
//   2  [45]       [62]       #1  -
//   3  [46..47]   [255]      #1  invalid input
//   4  [48..57]   [52..61]   #1  0..9
//   5  [58..63]   [255]      #1  invalid input
//   6  [64]       [255]      #2  invalid input
//   7  [65..90]   [0..25]    #2  A..Z
//   8  [91..94]   [255]      #2  invalid input
//   9  [95]       [63]       #2  _   
//  10  [96]       [255]      #2  invalid input
//  11  [97..122]  [26..51]   #2  a..z
//  12  [123..126] [255]      #2  invalid input
// (13) Everything else => invalid input

// The first LUT will use the VTBL instruction (out of range indices are set to
// 0 in destination).
//
// URL alphabet, 64 entries, indexed directly by input byte values 0..63:
//   - entry 45 ('-') holds 62,
//   - entries 48..57 ('0'..'9') hold 52..61,
//   - every other entry (including 43 '+' and 47 '/') is 255, the marker for
//     invalid input.
static const uint8_t dec_lut1[] = {
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  255U, 255U, 62U, 255U,  255U,
	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
};

// The second LUT will use the VTBX instruction (out of range indices will be
// unchanged in destination). Input [64..126] will be mapped to index [1..63]
// in this LUT. Index 0 means that value comes from LUT #1.
//
// URL alphabet, 64 entries (input byte = index + 63 for indices 1..63):
//   - entry 0 is a placeholder 0 (see above),
//   - entry 1 (input 64) is 255,
//   - entries 2..27 ('A'..'Z') hold 0..25,
//   - entries 28..31 (inputs 91..94) are 255,
//   - entry 32 (input 95, '_') holds 63,
//   - entry 33 (input 96) is 255,
//   - entries 34..59 ('a'..'z') hold 26..51,
//   - entries 60..63 (inputs 123..126) are 255.
// 255 is the marker for invalid input.
static const uint8_t dec_lut2[] = {
	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
	 63U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
};

Need to check whether the Go assembler supports the instructions UQSUB (vqsubq_u8), CMHI (vcgtq_u8), and UMAXV (vmaxvq_u8).

emmansun · 2023-11-05T02:43:16Z

VTBX指令从Golang1.20才引入，要支持低版本的Go，除非使用指令字编码。后来改为指令字编码。
UQSUB (vqsubq_u8) / CMHI (vcgtq_u8) 使用指令字编码。
不用UMAXV (vmaxvq_u8)和0比较，直接用Scalar指令比较。不过，后来还是用了UMAXV (vmaxvq_u8)。

emmansun · 2023-11-06T00:44:50Z

// uqsub returns the 32-bit instruction word for
//
//	UQSUB Vd.16B, Vn.16B, Vm.16B
//
// Each register number is reduced to its low five bits and placed into the
// word: Vd at bits 0-4, Vn at bits 5-9 and Vm at bits 16-20.
func uqsub(Vd, Vn, Vm byte) uint32 {
	const (
		opcode  uint32 = 0x6e202c00
		regMask byte   = 0x1f
	)
	rd := uint32(Vd & regMask)
	rn := uint32(Vn&regMask) << 5
	rm := uint32(Vm&regMask) << 16
	return opcode | rd | rn | rm
}

// cmhi returns the 32-bit instruction word for
//
//	CMHI Vd.16B, Vn.16B, Vm.16B
//
// Each register number is reduced to its low five bits and placed into the
// word: Vd at bits 0-4, Vn at bits 5-9 and Vm at bits 16-20.
func cmhi(Vd, Vn, Vm byte) uint32 {
	const (
		opcode  uint32 = 0x6e203400
		regMask byte   = 0x1f
	)
	rd := uint32(Vd & regMask)
	rn := uint32(Vn&regMask) << 5
	rm := uint32(Vm&regMask) << 16
	return opcode | rd | rn | rm
}

// umax returns the 32-bit instruction word for
//
//	UMAXV <V><d>, <Vn>.<t>
//
// built on the base word 0x6e30a800. Each register number is reduced to its
// low five bits; Rd goes into bits 0-4 and Vn into bits 5-9.
func umax(Rd, Vn byte) uint32 {
	const (
		opcode  uint32 = 0x6e30a800
		regMask byte   = 0x1f
	)
	dst := uint32(Rd & regMask)
	src := uint32(Vn&regMask) << 5
	return opcode | dst | src
}

// tbx4 returns the 32-bit instruction word for the four-register table form
//
//	TBX Vd.Ta, { Vn.16B, <Vn+1>.16B, <Vn+2>.16B, <Vn+3>.16B }, Vm.Ta
//
// built on the base word 0x4e007000. Each register number is reduced to its
// low five bits; Vd goes into bits 0-4, Vn (first table register) into bits
// 5-9 and Vm (index register) into bits 16-20.
func tbx4(Vd, Vn, Vm byte) uint32 {
	const (
		opcode  uint32 = 0x4e007000
		regMask byte   = 0x1f
	)
	dst := uint32(Vd & regMask)
	table := uint32(Vn&regMask) << 5
	index := uint32(Vm&regMask) << 16
	return opcode | dst | table | index
}

// TestGenInstr prints one "WORD $0x…" line per encoded instruction to
// standard output (presumably for pasting into Go assembly source). It makes
// no assertions; it is a generator wrapped in a test function.
//
// Output order: four UQSUB words, four CMHI words, one UMAXV word, then four
// TBX words.
func TestGenInstr(t *testing.T) {
	emit := func(word uint32) {
		fmt.Printf("WORD $0x%08x\n", word)
	}

	// UQSUB V(16+i).16B, Vi.16B, V7.16B for i = 0..3.
	for i := byte(0); i < 4; i++ {
		emit(uqsub(16+i, i, 7))
	}
	// CMHI V(16+i).16B, Vi.16B, V7.16B for i = 0..3.
	for i := byte(0); i < 4; i++ {
		emit(cmhi(16+i, i, 7))
	}
	// UMAXV with destination 17 and source V16.
	emit(umax(17, 16))
	// TBX Vr, {V12..V15}, Vr for r = 16..19 (same register as index and destination).
	for r := byte(16); r < 20; r++ {
		emit(tbx4(r, 12, r))
	}
}

emmansun added the enhancement New feature or request label Nov 3, 2023

emmansun self-assigned this Nov 3, 2023

emmansun added a commit that referenced this issue Nov 6, 2023

enable arm64 decode #2

daa8f0a

emmansun added a commit that referenced this issue Nov 6, 2023

fix bug #2

53ae6ee

emmansun added a commit that referenced this issue Nov 6, 2023

Fix copy paste issue #2

33d4192

emmansun added a commit that referenced this issue Nov 6, 2023

test VUMAXV #2

e0902f9

emmansun added a commit that referenced this issue Nov 6, 2023

uses tbx inst. word, supports lower version #2

e9de1b9

emmansun added a commit that referenced this issue Nov 6, 2023

change Q value, 8B to 16B #2

7502f55

emmansun closed this as completed Nov 6, 2023

emmansun mentioned this issue Nov 7, 2024

arm64: decode minor optimization #21

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

enable arm64 decode #2

enable arm64 decode #2

emmansun commented Nov 3, 2023 •

edited

Loading

emmansun commented Nov 5, 2023 •

edited

Loading

emmansun commented Nov 6, 2023 •

edited

Loading

enable arm64 decode #2

enable arm64 decode #2

Comments

emmansun commented Nov 3, 2023 • edited Loading

emmansun commented Nov 5, 2023 • edited Loading

emmansun commented Nov 6, 2023 • edited Loading

emmansun commented Nov 3, 2023 •

edited

Loading

emmansun commented Nov 5, 2023 •

edited

Loading

emmansun commented Nov 6, 2023 •

edited

Loading