You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
``asm
// Module with 1 kernels, global data with 0 words (64-bit each), starting at offset 1 words and 0 words of stack-frame
// Kernel 'loop1' with 49 instructions, offset 2, with following parameters: __global float* a (4 B, 1 items)
or -, unif, unif
or -, unif, unif
or r0, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or ra0, unif, unif
or r0, r0, r0
ldi r1, 255
and r0, r0, r1
and r0, r0, r1
shl r0, r0, 4 (4)
shl r0, r0, 2 (2)
add r0, ra0, r0
or r2, r0, r0
mul24 r1, 4 (4), elem_num
v8adds r0, 8 (8), 8 (8)
sub.setf -, elem_num, r0
or.ifnc r0, 0 (0), 0 (0)
add.ifn r0, r2, r1
or tmu0s, r0, r0
nop.load_tmu0.never
or r0, 2.000000 (33), 2.000000 (33)
fmul r0, r4, r0
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
or vpm, r0, r0
ldi vpw_setup, vdw_setup(rows: 1, elements: 16 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r2, r2
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or r0, unif, unif
or.setf -, elem_num, r0
brr.ifallzc (pc+4) + -45
nop.never
nop.never
nop.never
not irq, qpu_num
nop.thrend.never
nop.never
nop.never
In my opinion, this should be like:
```asm
;; This buffer is for text that is not saved, and for Lisp evaluation.
;; To create a file, visit it with <open> and enter text in its buffer.
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or r0, unif, unif
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
or -, unif, unif // should be removed
// label1: should be beginning of loop
or ra0, unif, unif
or r0, r0, r0 // just duplication, should be removed
ldi r1, 255 //
and r0, r0, r1 // these have no effects
and r0, r0, r1 //
shl r0, r0, 4 (4) //
shl r0, r0, 2 (2) // why shifted ?
add r0, ra0, r0 //
or r2, r0, r0 // r0 is reused by `v8add`. why need copy?
mul24 r1, 4 (4), elem_num // this should compute only once, should move to outside of loop
v8adds r0, 8 (8), 8 (8) // this should compute only once, should move to outside of loop
sub.setf -, elem_num, r0
or.ifnc r0, 0 (0), 0 (0) // r0 is re-assigned in next instruction, no affects
add.ifn r0, r2, r1
or tmu0s, r0, r0
nop.load_tmu0.never
or r0, 2.000000 (33), 2.000000 (33) // 2.0 includes smallImm, can be fused
fmul r0, r4, r0 //
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
or vpm, r0, r0
ldi vpw_setup, vdw_setup(rows: 1, elements: 16 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r2, r2
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or r0, unif, unif
or.setf -, elem_num, r0
brr.ifallzc (pc+4) + -45 // should be jump to label1
nop.never // should be replaced to meaningful op
nop.never // should be replaced to meaningful op
nop.never // should be replaced to meaningful op
not irq, qpu_num
nop.thrend.never
nop.never
nop.never
So we need:
Optimize layout of parameters
Enhancement of basic optimization (remove redundant moves, peepohole)
Dual issue (by Instruction Scheduler)
Use both TMU0 and TMU1 (by Instruction Scheduler)
Improve register allocation (but it might be very difficult)
How about make first release after implementation of such improvement? @doe300 What do you think of that?
The text was updated successfully, but these errors were encountered:
If have an idea of how to implement it and will try it out.
I think (as discussed a few times in other optimizations), most of the redundant moves come from mapping the source and destination of a move to the same register. Though it would be very helpful to remove those moves, I currently see no efficient way to do this (since at the time the locals are mapped to registers, we cannot remove any instruction anymore)
I spent a lot of time writing the register-allocator from scratch and its definitively still not perfect. But you are also right, the architecture (2 separate register-files, restrictions on which can be used where) make it very hard to write/improve register-allocation for the VideoCore IV.
Thanks. I agree that register-allocation for VC4 is very hard to implement efficiently.
For first release, I want to deal with relatively easy tasks that we can implement now.
Now I am trying to analyse what optimizations are necessary now....
I am thinking how to develop
VC4C
in efficient way.I really make it worth to adapt realistic application.
To do that, how about setting milestone for first release.
If you permit, I want to list current improvements what we should now.
In my opinion, we need evaluate output of
VC4C
and compare this with ideal output.For example, current
VC4C
of master output is as follows:``asm
// Module with 1 kernels, global data with 0 words (64-bit each), starting at offset 1 words and 0 words of stack-frame
// Kernel 'loop1' with 49 instructions, offset 2, with following parameters: __global float* a (4 B, 1 items)
or -, unif, unif
or -, unif, unif
or r0, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or -, unif, unif
or ra0, unif, unif
or r0, r0, r0
ldi r1, 255
and r0, r0, r1
and r0, r0, r1
shl r0, r0, 4 (4)
shl r0, r0, 2 (2)
add r0, ra0, r0
or r2, r0, r0
mul24 r1, 4 (4), elem_num
v8adds r0, 8 (8), 8 (8)
sub.setf -, elem_num, r0
or.ifnc r0, 0 (0), 0 (0)
add.ifn r0, r2, r1
or tmu0s, r0, r0
nop.load_tmu0.never
or r0, 2.000000 (33), 2.000000 (33)
fmul r0, r4, r0
or -, mutex_acq, mutex_acq
ldi vpw_setup, vpm_setup(size: 16 words, stride: 1 rows, address: h32(0))
or vpm, r0, r0
ldi vpw_setup, vdw_setup(rows: 1, elements: 16 words, address: h32(0))
ldi vpw_setup, vdw_setup(stride: 0)
or vpw_addr, r2, r2
or -, vpw_wait, vpw_wait
or mutex_rel, 1 (1), 1 (1)
or r0, unif, unif
or.setf -, elem_num, r0
brr.ifallzc (pc+4) + -45
nop.never
nop.never
nop.never
not irq, qpu_num
nop.thrend.never
nop.never
nop.never
So we need:
How about make first release after implementation of such improvement?
@doe300 What do you think of that?
The text was updated successfully, but these errors were encountered: