// // Generated by LLVM NVPTX Back-End // .version 7.0 .target sm_80 .address_size 64 // .globl _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE // -- Begin function _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE .extern .shared .align 32 .b8 shmem[]; // @_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE .visible .entry _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE( .param .align 8 .b8 _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_0[8], .param .align 8 .b8 _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_1[40], .param .align 8 .b8 _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_2[40], .param .align 8 .b8 _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_3[40], .param .align 8 .b8 _Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_4[40] ) { .reg .pred %p<6>; .reg .b16 %rs<4>; .reg .f32 %f<657>; .reg .b32 %r<14>; .reg .b64 %rd<383>; // %bb.0: // %conversion ld.param.u64 %rd1, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_1]; ld.param.u64 %rd60, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_1+16]; ld.param.u64 %rd2, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_2]; ld.param.u64 %rd61, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_2+16]; ld.param.u64 %rd62, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_3]; ld.param.u64 %rd63, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_3+16]; ld.param.u64 %rd3, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_4]; ld.param.u64 %rd4, [_Z16matmul_pipelined13CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EES_IS0_Li2ELi1EE11ElementwiseI8identityES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_ES1_IS2_E7Default4TypeI6ConfigI30_M___2560__N___2048__K___2048_24_M___64__N___64__K___32_Li8E15_M___4__K___32_14_M___1__K___4_15_K___2__N___64_14_K___1__N___4_15_M___64__N___2_14_M___4__N___1_23_M___16__N___32__K___2_22_M___8__N___16__K___2_15AlignedRowMajorIS0_ES6_IS0_E15AlignedColMajorIS0_ES7_IS0_E6PaddedIS6_IS0_ELi4EES8_IS6_IS0_ELi4EES7_IS0_ES7_IS0_E13TropicalFPUOpILi8ELi16ELi2ES0_S0_ELinfalseELinfalseEEE_param_4+16]; mov.u32 %r1, %ctaid.x; mul.wide.u32 %rd64, %r1, 64; mov.u32 %r2, %ctaid.y; shl.b32 %r3, %r2, 6; cvt.u64.u32 %rd65, %r3; mov.u32 %r4, %tid.x; shr.u32 %r5, %r4, 5; cvt.u64.u32 %rd66, %r4; and.b32 %r6, %r4, 31; shl.b32 %r7, %r5, 1; cvt.u64.u32 %rd67, %r7; cvt.u64.u32 %rd68, %r6; shr.u64 %rd69, %rd68, 4; mul.wide.u32 %rd70, %r6, 4; and.b64 %rd71, %rd70, 60; or.b64 %rd5, %rd69, %rd67; or.b64 %rd6, %rd5, %rd65; max.s64 %rd72, %rd63, 0; mul.lo.s64 %rd73, %rd6, %rd72; shl.b64 %rd74, %rd5, 6; or.b64 %rd75, %rd71, %rd74; or.b64 %rd7, %rd71, %rd64; add.s64 %rd76, %rd7, %rd73; shr.s64 %rd77, %rd76, 63; shr.u64 %rd78, %rd77, 62; add.s64 %rd79, %rd76, %rd78; shl.b64 %rd80, %rd79, 2; and.b64 %rd81, %rd80, -16; add.s64 %rd82, %rd62, %rd81; ld.global.v4.f32 {%f241, %f242, %f243, %f244}, [%rd82]; mov.u64 %rd83, shmem; shl.b64 %rd84, %rd75, 2; add.s64 %rd8, %rd83, %rd84; st.shared.v4.f32 [%rd8], {%f241, %f242, %f243, %f244}; shl.b64 %rd85, %rd72, 4; add.s64 %rd86, %rd76, %rd85; shr.s64 %rd87, %rd86, 63; shr.u64 %rd88, %rd87, 62; add.s64 %rd89, %rd86, %rd88; shl.b64 %rd90, %rd89, 2; and.b64 %rd91, %rd90, -16; add.s64 %rd92, %rd62, %rd91; ld.global.v4.f32 {%f245, %f246, %f247, %f248}, [%rd92]; add.s64 %rd9, %rd8, 4096; st.shared.v4.f32 [%rd8+4096], {%f245, %f246, %f247, %f248}; add.s64 %rd93, %rd86, %rd85; shr.s64 %rd94, %rd93, 63; shr.u64 %rd95, %rd94, 62; add.s64 %rd96, %rd93, %rd95; shl.b64 %rd97, %rd96, 2; and.b64 %rd98, %rd97, -16; add.s64 %rd99, %rd62, %rd98; ld.global.v4.f32 {%f249, %f250, %f251, %f252}, [%rd99]; add.s64 %rd10, %rd8, 8192; st.shared.v4.f32 [%rd8+8192], {%f249, %f250, %f251, %f252}; add.s64 %rd100, %rd93, %rd85; shr.s64 %rd101, %rd100, 63; shr.u64 %rd102, %rd101, 62; add.s64 %rd103, %rd100, %rd102; shl.b64 %rd104, %rd103, 2; and.b64 %rd105, %rd104, -16; add.s64 %rd106, %rd62, %rd105; ld.global.v4.f32 {%f253, %f254, %f255, %f256}, [%rd106]; add.s64 %rd11, %rd8, 12288; st.shared.v4.f32 [%rd8+12288], {%f253, %f254, %f255, %f256}; bar.sync 0; cvt.u64.u32 %rd107, %r5; mul.wide.u32 %rd108, %r5, 16; and.b64 %rd109, %rd108, 48; mul.wide.u32 %rd110, %r5, 8; and.b64 %rd111, %rd110, 224; cvt.u16.u32 %rs1, %r4; and.b16 %rs2, %rs1, 28; and.b64 %rd112, %rd66, 3; shr.u16 %rs3, %rs2, 2; cvt.u64.u16 %rd113, %rs3; or.b64 %rd114, %rd111, %rd113; shl.b64 %rd115, %rd114, 6; or.b64 %rd116, %rd108, %rd112; or.b64 %rd117, %rd116, -52; add.s64 %rd118, %rd115, %rd117; add.s64 %rd119, %rd115, 64; or.b64 %rd120, %rd109, %rd112; or.b64 %rd121, %rd120, 4; or.b64 %rd122, %rd121, %rd119; shl.b64 %rd123, %rd122, 2; add.s64 %rd12, %rd83, %rd123; ld.shared.f32 %f593, [%rd12+-272]; add.s64 %rd124, %rd115, 576; or.b64 %rd125, %rd121, %rd124; shl.b64 %rd126, %rd125, 2; add.s64 %rd13, %rd83, %rd126; ld.shared.f32 %f595, [%rd13+-272]; or.b64 %rd127, %rd120, 8; or.b64 %rd128, %rd127, %rd119; shl.b64 %rd129, %rd128, 2; add.s64 %rd14, %rd83, %rd129; ld.shared.f32 %f594, [%rd14+-272]; or.b64 %rd130, %rd127, %rd124; shl.b64 %rd131, %rd130, 2; add.s64 %rd15, %rd83, %rd131; ld.shared.f32 %f596, [%rd15+-272]; or.b64 %rd132, %rd111, 9; add.s64 %rd133, %rd132, %rd113; shl.b64 %rd134, %rd133, 6; add.s64 %rd135, %rd134, 512; or.b64 %rd136, %rd121, %rd135; shl.b64 %rd137, %rd136, 2; add.s64 %rd16, %rd83, %rd137; ld.shared.f32 %f601, [%rd16+-272]; add.s64 %rd138, %rd134, 1024; or.b64 %rd139, %rd121, %rd138; shl.b64 %rd140, %rd139, 2; add.s64 %rd17, %rd83, %rd140; ld.shared.f32 %f603, [%rd17+-272]; or.b64 %rd141, %rd127, %rd135; shl.b64 %rd142, %rd141, 2; add.s64 %rd18, %rd83, %rd142; ld.shared.f32 %f602, [%rd18+-272]; or.b64 %rd143, %rd127, %rd138; shl.b64 %rd144, %rd143, 2; add.s64 %rd19, %rd83, %rd144; ld.shared.f32 %f604, [%rd19+-272]; or.b64 %rd145, %rd116, -60; add.s64 %rd146, %rd145, %rd119; shl.b64 %rd147, %rd146, 2; add.s64 %rd20, %rd83, %rd147; ld.shared.f32 %f597, [%rd20+16]; add.s64 %rd148, %rd145, %rd124; shl.b64 %rd149, %rd148, 2; add.s64 %rd21, %rd83, %rd149; ld.shared.f32 %f599, [%rd21+16]; shl.b64 %rd150, %rd118, 2; add.s64 %rd151, %rd150, %rd83; add.s64 %rd22, %rd151, -1792; ld.shared.f32 %f598, [%rd151+256]; ld.shared.f32 %f600, [%rd151+2304]; add.s64 %rd152, %rd145, %rd135; shl.b64 %rd153, %rd152, 2; add.s64 %rd23, %rd83, %rd153; ld.shared.f32 %f605, [%rd23+16]; add.s64 %rd154, %rd145, %rd138; shl.b64 %rd155, %rd154, 2; add.s64 %rd24, %rd83, %rd155; ld.shared.f32 %f607, [%rd24+16]; add.s64 %rd156, %rd117, %rd134; shl.b64 %rd157, %rd156, 2; add.s64 %rd25, %rd83, %rd157; ld.shared.f32 %f606, [%rd25+2048]; ld.shared.f32 %f608, [%rd25+4096]; bar.sync 0; shr.u64 %rd158, %rd107, 2; shr.u64 %rd159, %rd107, 3; shl.b64 %rd26, %rd159, 1; sub.s64 %rd160, %rd158, %rd26; shl.b64 %rd161, %rd160, 5; shl.b32 %r8, %r5, 2; cvt.u64.u32 %rd162, %r8; shr.u64 %rd163, %rd68, 3; and.b64 %rd27, %rd70, 28; or.b64 %rd164, %rd163, %rd162; add.s64 %rd165, %rd164, %rd64; max.s64 %rd166, %rd60, 0; mul.lo.s64 %rd28, %rd165, %rd166; add.s64 %rd167, %rd28, %rd27; shr.s64 %rd168, %rd167, 63; shr.u64 %rd169, %rd168, 62; add.s64 %rd170, %rd167, %rd169; shl.b64 %rd171, %rd170, 2; and.b64 %rd172, %rd171, -16; add.s64 %rd173, %rd1, %rd172; ld.global.v4.f32 {%f257, %f258, %f259, %f260}, [%rd173]; shl.b64 %rd29, %rd166, 5; add.s64 %rd174, %rd167, %rd29; shr.s64 %rd175, %rd174, 63; shr.u64 %rd176, %rd175, 62; add.s64 %rd177, %rd174, %rd176; shl.b64 %rd178, %rd177, 2; and.b64 %rd179, %rd178, -16; add.s64 %rd180, %rd1, %rd179; ld.global.v4.f32 {%f261, %f262, %f263, %f264}, [%rd180]; max.s64 %rd30, %rd61, 0; mul.lo.s64 %rd181, %rd5, %rd30; or.b64 %rd31, %rd71, %rd65; add.s64 %rd182, %rd31, %rd181; shr.s64 %rd183, %rd182, 63; shr.u64 %rd184, %rd183, 62; add.s64 %rd185, %rd182, %rd184; shl.b64 %rd186, %rd185, 2; and.b64 %rd187, %rd186, -16; add.s64 %rd188, %rd2, %rd187; ld.global.v4.f32 {%f265, %f266, %f267, %f268}, [%rd188]; shl.b64 %rd32, %rd30, 4; add.s64 %rd189, %rd182, %rd32; shr.s64 %rd190, %rd189, 63; shr.u64 %rd191, %rd190, 62; add.s64 %rd192, %rd189, %rd191; shl.b64 %rd193, %rd192, 2; and.b64 %rd194, %rd193, -16; add.s64 %rd195, %rd2, %rd194; ld.global.v4.f32 {%f269, %f270, %f271, %f272}, [%rd195]; mul.lo.s64 %rd196, %rd164, 36; add.s64 %rd197, %rd27, %rd196; shl.b64 %rd198, %rd197, 2; add.s64 %rd33, %rd83, %rd198; st.shared.v4.f32 [%rd33], {%f257, %f258, %f259, %f260}; add.s64 %rd34, %rd33, 4608; st.shared.v4.f32 [%rd33+4608], {%f261, %f262, %f263, %f264}; mul.lo.s64 %rd199, %rd5, 68; add.s64 %rd200, %rd71, %rd199; add.s64 %rd201, %rd83, 9216; shl.b64 %rd202, %rd200, 2; add.s64 %rd35, %rd201, %rd202; st.shared.v4.f32 [%rd35], {%f265, %f266, %f267, %f268}; add.s64 %rd36, %rd35, 4352; st.shared.v4.f32 [%rd35+4352], {%f269, %f270, %f271, %f272}; bar.sync 0; and.b32 %r9, %r4, 3; add.s32 %r10, %r9, 1; cvt.u64.u32 %rd203, %r10; or.b64 %rd204, %rd109, %rd203; mul.lo.s64 %rd37, %rd204, 36; add.s64 %rd205, %rd26, %rd37; shl.b64 %rd206, %rd205, 2; add.s64 %rd207, %rd206, %rd83; ld.shared.v2.f32 {%f625, %f627}, [%rd207+-144]; add.s64 %rd38, %rd37, 144; add.s64 %rd208, %rd26, %rd38; shl.b64 %rd209, %rd208, 2; add.s64 %rd210, %rd209, %rd83; ld.shared.v2.f32 {%f626, %f628}, [%rd210+-144]; or.b64 %rd211, %rd109, 4; add.s64 %rd212, %rd211, %rd203; mul.lo.s64 %rd213, %rd212, 36; add.s64 %rd39, %rd213, 144; add.s64 %rd214, %rd26, %rd39; shl.b64 %rd215, %rd214, 2; add.s64 %rd216, %rd215, %rd83; ld.shared.v2.f32 {%f629, %f631}, [%rd216+-144]; add.s64 %rd40, %rd213, 288; add.s64 %rd217, %rd26, %rd40; shl.b64 %rd218, %rd217, 2; add.s64 %rd219, %rd218, %rd83; ld.shared.v2.f32 {%f630, %f632}, [%rd219+-144]; bfe.u32 %r11, %r4, 2, 3; add.s32 %r12, %r11, 1; cvt.u64.u32 %rd220, %r12; or.b64 %rd221, %rd26, 1; mul.lo.s64 %rd222, %rd221, 68; or.b64 %rd223, %rd161, %rd220; add.s64 %rd41, %rd223, 8; add.s64 %rd224, %rd41, %rd222; shl.b64 %rd225, %rd224, 2; add.s64 %rd226, %rd201, %rd225; ld.shared.f32 %f633, [%rd226+-308]; mul.lo.s64 %rd227, %rd159, 136; add.s64 %rd228, %rd227, 136; add.s64 %rd229, %rd41, %rd228; shl.b64 %rd230, %rd229, 2; add.s64 %rd231, %rd201, %rd230; ld.shared.f32 %f634, [%rd231+-308]; or.b64 %rd42, %rd223, 16; add.s64 %rd232, %rd42, %rd222; shl.b64 %rd233, %rd232, 2; add.s64 %rd234, %rd201, %rd233; ld.shared.f32 %f635, [%rd234+-308]; add.s64 %rd235, %rd42, %rd228; shl.b64 %rd236, %rd235, 2; add.s64 %rd237, %rd201, %rd236; ld.shared.f32 %f636, [%rd237+-308]; ld.shared.f32 %f637, [%rd226+-244]; ld.shared.f32 %f638, [%rd231+-244]; ld.shared.f32 %f639, [%rd234+-244]; ld.shared.f32 %f640, [%rd237+-244]; or.b64 %rd238, %rd27, 32; add.s64 %rd239, %rd238, %rd28; shr.s64 %rd240, %rd239, 63; shr.u64 %rd241, %rd240, 62; add.s64 %rd242, %rd239, %rd241; shl.b64 %rd243, %rd242, 2; and.b64 %rd244, %rd243, -16; add.s64 %rd245, %rd1, %rd244; ld.global.v4.f32 {%f649, %f650, %f651, %f652}, [%rd245]; add.s64 %rd246, %rd239, %rd29; shr.s64 %rd247, %rd246, 63; shr.u64 %rd248, %rd247, 62; add.s64 %rd249, %rd246, %rd248; shl.b64 %rd250, %rd249, 2; and.b64 %rd251, %rd250, -16; add.s64 %rd252, %rd1, %rd251; ld.global.v4.f32 {%f653, %f654, %f655, %f656}, [%rd252]; add.s64 %rd253, %rd5, 32; mul.lo.s64 %rd254, %rd253, %rd30; add.s64 %rd255, %rd31, %rd254; shr.s64 %rd256, %rd255, 63; shr.u64 %rd257, %rd256, 62; add.s64 %rd258, %rd255, %rd257; shl.b64 %rd259, %rd258, 2; and.b64 %rd260, %rd259, -16; add.s64 %rd261, %rd2, %rd260; ld.global.v4.f32 {%f641, %f642, %f643, %f644}, [%rd261]; add.s64 %rd262, %rd255, %rd32; shr.s64 %rd263, %rd262, 63; shr.u64 %rd264, %rd263, 62; add.s64 %rd265, %rd262, %rd264; shl.b64 %rd266, %rd265, 2; and.b64 %rd267, %rd266, -16; add.s64 %rd268, %rd2, %rd267; ld.global.v4.f32 {%f645, %f646, %f647, %f648}, [%rd268]; mul.lo.s64 %rd269, %rd159, 288; shl.b64 %rd270, %rd158, 7; add.s64 %rd271, %rd269, %rd270; mul.wide.u32 %rd272, %r12, 4; add.s64 %rd273, %rd272, %rd271; add.s64 %rd274, %rd273, %rd83; add.s64 %rd43, %rd274, 11212; mul.lo.s64 %rd275, %rd204, 144; shl.b64 %rd276, %rd159, 3; add.s64 %rd277, %rd275, %rd276; add.s64 %rd278, %rd277, %rd83; add.s64 %rd44, %rd278, -136; bfe.u32 %r13, %r4, 5, 2; mul.wide.u32 %rd279, %r13, 2304; mul.wide.u32 %rd280, %r10, 144; add.s64 %rd281, %rd280, %rd279; add.s64 %rd282, %rd281, %rd276; add.s64 %rd283, %rd282, %rd83; add.s64 %rd45, %rd283, 1016; mov.u64 %rd59, 0; mov.u64 %rd379, %rd59; bra.uni LBB0_1; LBB0_6: // %L18271.L18279_crit_edge // in Loop: Header=BB0_1 Depth=1 bar.sync 0; add.s64 %rd55, %rd379, 32; setp.ne.s64 %p5, %rd379, 2016; mov.u64 %rd379, %rd55; @%p5 bra LBB0_1; bra.uni LBB0_7; LBB0_1: // %L11467 // =>This Loop Header: Depth=1 // Child Loop BB0_2 Depth 2 add.s64 %rd286, %rd379, 64; or.b64 %rd287, %rd286, %rd27; add.s64 %rd288, %rd287, %rd28; shr.s64 %rd289, %rd288, 63; shr.u64 %rd290, %rd289, 62; add.s64 %rd291, %rd288, %rd290; shl.b64 %rd292, %rd291, 2; and.b64 %rd293, %rd292, -16; add.s64 %rd48, %rd1, %rd293; add.s64 %rd294, %rd288, %rd29; shr.s64 %rd295, %rd294, 63; shr.u64 %rd296, %rd295, 62; add.s64 %rd297, %rd294, %rd296; shl.b64 %rd298, %rd297, 2; and.b64 %rd299, %rd298, -16; add.s64 %rd49, %rd1, %rd299; add.s64 %rd300, %rd286, %rd5; mul.lo.s64 %rd301, %rd300, %rd30; add.s64 %rd302, %rd31, %rd301; shr.s64 %rd303, %rd302, 63; shr.u64 %rd304, %rd303, 62; add.s64 %rd305, %rd302, %rd304; shl.b64 %rd306, %rd305, 2; and.b64 %rd307, %rd306, -16; add.s64 %rd50, %rd2, %rd307; add.s64 %rd308, %rd302, %rd32; shr.s64 %rd309, %rd308, 63; shr.u64 %rd310, %rd309, 62; add.s64 %rd311, %rd308, %rd310; shl.b64 %rd312, %rd311, 2; and.b64 %rd313, %rd312, -16; add.s64 %rd51, %rd2, %rd313; mov.u64 %rd380, 8; mov.u64 %rd381, %rd43; mov.u64 %rd382, %rd59; bra.uni LBB0_2; LBB0_5: // %L16930.3 // in Loop: Header=BB0_2 Depth=2 selp.b64 %rd316, 0, %rd380, %p1; or.b64 %rd317, %rd316, %rd26; add.s64 %rd318, %rd317, %rd37; shl.b64 %rd320, %rd318, 2; add.s64 %rd321, %rd320, %rd83; ld.shared.f32 %f625, [%rd321+-144]; ld.shared.f32 %f627, [%rd321+-140]; add.s64 %rd322, %rd317, %rd38; shl.b64 %rd323, %rd322, 2; add.s64 %rd324, %rd323, %rd83; ld.shared.f32 %f626, [%rd324+-144]; ld.shared.f32 %f628, [%rd324+-140]; add.s64 %rd325, %rd317, %rd39; shl.b64 %rd326, %rd325, 2; add.s64 %rd327, %rd326, %rd83; ld.shared.f32 %f629, [%rd327+-144]; ld.shared.f32 %f631, [%rd327+-140]; add.s64 %rd328, %rd317, %rd40; shl.b64 %rd329, %rd328, 2; add.s64 %rd330, %rd329, %rd83; ld.shared.f32 %f630, [%rd330+-144]; ld.shared.f32 %f632, [%rd330+-140]; or.b64 %rd331, %rd317, 1; mul.lo.s64 %rd332, %rd331, 68; add.s64 %rd333, %rd41, %rd332; shl.b64 %rd334, %rd333, 2; add.s64 %rd336, %rd201, %rd334; ld.shared.f32 %f633, [%rd336+-308]; mul.lo.s64 %rd337, %rd317, 68; add.s64 %rd338, %rd337, 136; add.s64 %rd339, %rd41, %rd338; shl.b64 %rd340, %rd339, 2; add.s64 %rd341, %rd201, %rd340; ld.shared.f32 %f634, [%rd341+-308]; add.s64 %rd342, %rd42, %rd332; shl.b64 %rd343, %rd342, 2; add.s64 %rd344, %rd201, %rd343; ld.shared.f32 %f635, [%rd344+-308]; add.s64 %rd345, %rd42, %rd338; shl.b64 %rd346, %rd345, 2; add.s64 %rd347, %rd201, %rd346; ld.shared.f32 %f636, [%rd347+-308]; ld.shared.f32 %f637, [%rd336+-244]; ld.shared.f32 %f638, [%rd341+-244]; ld.shared.f32 %f639, [%rd344+-244]; ld.shared.f32 %f640, [%rd347+-244]; add.f32 %f481, %f145, %f153; max.f32 %f482, %f481, %f161; add.f32 %f483, %f146, %f154; max.f32 %f593, %f483, %f482; add.f32 %f484, %f145, %f155; max.f32 %f485, %f484, %f162; add.f32 %f486, %f146, %f156; max.f32 %f595, %f486, %f485; add.f32 %f487, %f147, %f153; max.f32 %f488, %f487, %f163; add.f32 %f489, %f148, %f154; max.f32 %f594, %f489, %f488; add.f32 %f490, %f147, %f155; max.f32 %f491, %f490, %f164; add.f32 %f492, %f148, %f156; max.f32 %f596, %f492, %f491; add.f32 %f493, %f145, %f157; max.f32 %f494, %f493, %f165; add.f32 %f495, %f146, %f158; max.f32 %f601, %f495, %f494; add.f32 %f496, %f145, %f159; max.f32 %f497, %f496, %f166; add.f32 %f498, %f146, %f160; max.f32 %f603, %f498, %f497; add.f32 %f499, %f147, %f157; max.f32 %f500, %f499, %f167; add.f32 %f501, %f148, %f158; max.f32 %f602, %f501, %f500; add.f32 %f502, %f147, %f159; max.f32 %f503, %f502, %f168; add.f32 %f504, %f148, %f160; max.f32 %f604, %f504, %f503; add.f32 %f505, %f149, %f153; max.f32 %f506, %f505, %f169; add.f32 %f507, %f150, %f154; max.f32 %f597, %f507, %f506; add.f32 %f508, %f149, %f155; max.f32 %f509, %f508, %f170; add.f32 %f510, %f150, %f156; max.f32 %f599, %f510, %f509; add.f32 %f511, %f151, %f153; max.f32 %f512, %f511, %f171; add.f32 %f513, %f152, %f154; max.f32 %f598, %f513, %f512; add.f32 %f514, %f151, %f155; max.f32 %f515, %f514, %f172; add.f32 %f516, %f152, %f156; max.f32 %f600, %f516, %f515; add.f32 %f517, %f149, %f157; max.f32 %f518, %f517, %f173; add.f32 %f519, %f150, %f158; max.f32 %f605, %f519, %f518; add.f32 %f520, %f149, %f159; max.f32 %f521, %f520, %f174; add.f32 %f522, %f150, %f160; max.f32 %f607, %f522, %f521; add.f32 %f523, %f151, %f157; max.f32 %f524, %f523, %f175; add.f32 %f525, %f152, %f158; max.f32 %f606, %f525, %f524; add.f32 %f526, %f151, %f159; max.f32 %f527, %f526, %f176; add.f32 %f528, %f152, %f160; max.f32 %f608, %f528, %f527; add.s64 %rd382, %rd382, 32; add.s64 %rd381, %rd381, 2176; add.s64 %rd380, %rd380, 8; setp.eq.s64 %p4, %rd382, 128; @%p4 bra LBB0_6; LBB0_2: // %L11475 // Parent Loop BB0_1 Depth=1 // => This Inner Loop Header: Depth=2 .pragma "nounroll"; add.s64 %rd314, %rd44, %rd382; ld.shared.f32 %f273, [%rd314]; ld.shared.f32 %f274, [%rd314+4]; add.s64 %rd315, %rd45, %rd382; ld.shared.f32 %f275, [%rd315+-576]; ld.shared.f32 %f276, [%rd315+-572]; ld.shared.f32 %f277, [%rd315]; ld.shared.f32 %f278, [%rd315+4]; ld.shared.f32 %f279, [%rd315+576]; ld.shared.f32 %f280, [%rd315+580]; ld.shared.f32 %f281, [%rd381+-1456]; ld.shared.f32 %f282, [%rd381+-1184]; ld.shared.f32 %f283, [%rd381+-1424]; ld.shared.f32 %f284, [%rd381+-1152]; ld.shared.f32 %f285, [%rd381+-1392]; ld.shared.f32 %f286, [%rd381+-1120]; ld.shared.f32 %f287, [%rd381+-1360]; ld.shared.f32 %f288, [%rd381+-1088]; add.f32 %f289, %f625, %f633; max.f32 %f290, %f289, %f593; add.f32 %f291, %f627, %f634; max.f32 %f292, %f291, %f290; add.f32 %f293, %f625, %f635; max.f32 %f294, %f293, %f595; add.f32 %f295, %f627, %f636; max.f32 %f296, %f295, %f294; add.f32 %f297, %f626, %f633; max.f32 %f298, %f297, %f594; add.f32 %f299, %f628, %f634; max.f32 %f300, %f299, %f298; add.f32 %f301, %f626, %f635; max.f32 %f302, %f301, %f596; add.f32 %f303, %f628, %f636; max.f32 %f304, %f303, %f302; add.f32 %f305, %f625, %f637; max.f32 %f306, %f305, %f601; add.f32 %f307, %f627, %f638; max.f32 %f308, %f307, %f306; add.f32 %f309, %f625, %f639; max.f32 %f310, %f309, %f603; add.f32 %f311, %f627, %f640; max.f32 %f312, %f311, %f310; add.f32 %f313, %f626, %f637; max.f32 %f314, %f313, %f602; add.f32 %f315, %f628, %f638; max.f32 %f316, %f315, %f314; add.f32 %f317, %f626, %f639; max.f32 %f318, %f317, %f604; add.f32 %f319, %f628, %f640; max.f32 %f320, %f319, %f318; add.f32 %f321, %f629, %f633; max.f32 %f322, %f321, %f597; add.f32 %f323, %f631, %f634; max.f32 %f324, %f323, %f322; add.f32 %f325, %f629, %f635; max.f32 %f326, %f325, %f599; add.f32 %f327, %f631, %f636; max.f32 %f328, %f327, %f326; add.f32 %f329, %f630, %f633; max.f32 %f330, %f329, %f598; add.f32 %f331, %f632, %f634; max.f32 %f332, %f331, %f330; add.f32 %f333, %f630, %f635; max.f32 %f334, %f333, %f600; add.f32 %f335, %f632, %f636; max.f32 %f336, %f335, %f334; add.f32 %f337, %f629, %f637; max.f32 %f338, %f337, %f605; add.f32 %f339, %f631, %f638; max.f32 %f340, %f339, %f338; add.f32 %f341, %f629, %f639; max.f32 %f342, %f341, %f607; add.f32 %f343, %f631, %f640; max.f32 %f344, %f343, %f342; add.f32 %f345, %f630, %f637; max.f32 %f346, %f345, %f606; add.f32 %f347, %f632, %f638; max.f32 %f348, %f347, %f346; add.f32 %f349, %f630, %f639; max.f32 %f350, %f349, %f608; add.f32 %f351, %f632, %f640; max.f32 %f352, %f351, %f350; ld.shared.f32 %f353, [%rd314+8]; ld.shared.f32 %f354, [%rd314+12]; ld.shared.f32 %f355, [%rd315+-568]; ld.shared.f32 %f356, [%rd315+-564]; ld.shared.f32 %f357, [%rd315+8]; ld.shared.f32 %f358, [%rd315+12]; ld.shared.f32 %f359, [%rd315+584]; ld.shared.f32 %f360, [%rd315+588]; ld.shared.f32 %f361, [%rd381+-912]; ld.shared.f32 %f362, [%rd381+-640]; ld.shared.f32 %f363, [%rd381+-880]; ld.shared.f32 %f364, [%rd381+-608]; ld.shared.f32 %f365, [%rd381+-848]; ld.shared.f32 %f366, [%rd381+-576]; ld.shared.f32 %f367, [%rd381+-816]; ld.shared.f32 %f368, [%rd381+-544]; add.f32 %f369, %f273, %f281; max.f32 %f370, %f369, %f292; add.f32 %f371, %f274, %f282; max.f32 %f372, %f371, %f370; add.f32 %f373, %f273, %f283; max.f32 %f374, %f373, %f296; add.f32 %f375, %f274, %f284; max.f32 %f376, %f375, %f374; add.f32 %f377, %f275, %f281; max.f32 %f378, %f377, %f300; add.f32 %f379, %f276, %f282; max.f32 %f380, %f379, %f378; add.f32 %f381, %f275, %f283; max.f32 %f382, %f381, %f304; add.f32 %f383, %f276, %f284; max.f32 %f384, %f383, %f382; add.f32 %f385, %f273, %f285; max.f32 %f386, %f385, %f308; add.f32 %f387, %f274, %f286; max.f32 %f388, %f387, %f386; add.f32 %f389, %f273, %f287; max.f32 %f390, %f389, %f312; add.f32 %f391, %f274, %f288; max.f32 %f392, %f391, %f390; add.f32 %f393, %f275, %f285; max.f32 %f394, %f393, %f316; add.f32 %f395, %f276, %f286; max.f32 %f396, %f395, %f394; add.f32 %f397, %f275, %f287; max.f32 %f398, %f397, %f320; add.f32 %f399, %f276, %f288; max.f32 %f400, %f399, %f398; add.f32 %f401, %f277, %f281; max.f32 %f402, %f401, %f324; add.f32 %f403, %f278, %f282; max.f32 %f404, %f403, %f402; add.f32 %f405, %f277, %f283; max.f32 %f406, %f405, %f328; add.f32 %f407, %f278, %f284; max.f32 %f408, %f407, %f406; add.f32 %f409, %f279, %f281; max.f32 %f410, %f409, %f332; add.f32 %f411, %f280, %f282; max.f32 %f412, %f411, %f410; add.f32 %f413, %f279, %f283; max.f32 %f414, %f413, %f336; add.f32 %f415, %f280, %f284; max.f32 %f416, %f415, %f414; add.f32 %f417, %f277, %f285; max.f32 %f418, %f417, %f340; add.f32 %f419, %f278, %f286; max.f32 %f420, %f419, %f418; add.f32 %f421, %f277, %f287; max.f32 %f422, %f421, %f344; add.f32 %f423, %f278, %f288; max.f32 %f424, %f423, %f422; add.f32 %f425, %f279, %f285; max.f32 %f426, %f425, %f348; add.f32 %f427, %f280, %f286; max.f32 %f428, %f427, %f426; add.f32 %f429, %f279, %f287; max.f32 %f430, %f429, %f352; add.f32 %f431, %f280, %f288; max.f32 %f432, %f431, %f430; ld.shared.f32 %f145, [%rd314+16]; ld.shared.f32 %f146, [%rd314+20]; ld.shared.f32 %f147, [%rd315+-560]; ld.shared.f32 %f148, [%rd315+-556]; ld.shared.f32 %f149, [%rd315+16]; ld.shared.f32 %f150, [%rd315+20]; ld.shared.f32 %f151, [%rd315+592]; ld.shared.f32 %f152, [%rd315+596]; ld.shared.f32 %f153, [%rd381+-368]; ld.shared.f32 %f154, [%rd381+-96]; ld.shared.f32 %f155, [%rd381+-336]; ld.shared.f32 %f156, [%rd381+-64]; ld.shared.f32 %f157, [%rd381+-304]; ld.shared.f32 %f158, [%rd381+-32]; ld.shared.f32 %f159, [%rd381+-272]; ld.shared.f32 %f160, [%rd381]; add.f32 %f433, %f353, %f361; max.f32 %f434, %f433, %f372; add.f32 %f435, %f354, %f362; max.f32 %f161, %f435, %f434; add.f32 %f436, %f353, %f363; max.f32 %f437, %f436, %f376; add.f32 %f438, %f354, %f364; max.f32 %f162, %f438, %f437; add.f32 %f439, %f355, %f361; max.f32 %f440, %f439, %f380; add.f32 %f441, %f356, %f362; max.f32 %f163, %f441, %f440; add.f32 %f442, %f355, %f363; max.f32 %f443, %f442, %f384; add.f32 %f444, %f356, %f364; max.f32 %f164, %f444, %f443; add.f32 %f445, %f353, %f365; max.f32 %f446, %f445, %f388; add.f32 %f447, %f354, %f366; max.f32 %f165, %f447, %f446; add.f32 %f448, %f353, %f367; max.f32 %f449, %f448, %f392; add.f32 %f450, %f354, %f368; max.f32 %f166, %f450, %f449; add.f32 %f451, %f355, %f365; max.f32 %f452, %f451, %f396; add.f32 %f453, %f356, %f366; max.f32 %f167, %f453, %f452; add.f32 %f454, %f355, %f367; max.f32 %f455, %f454, %f400; add.f32 %f456, %f356, %f368; max.f32 %f168, %f456, %f455; add.f32 %f457, %f357, %f361; max.f32 %f458, %f457, %f404; add.f32 %f459, %f358, %f362; max.f32 %f169, %f459, %f458; add.f32 %f460, %f357, %f363; max.f32 %f461, %f460, %f408; add.f32 %f462, %f358, %f364; max.f32 %f170, %f462, %f461; add.f32 %f463, %f359, %f361; max.f32 %f464, %f463, %f412; add.f32 %f465, %f360, %f362; max.f32 %f171, %f465, %f464; add.f32 %f466, %f359, %f363; max.f32 %f467, %f466, %f416; add.f32 %f468, %f360, %f364; max.f32 %f172, %f468, %f467; add.f32 %f469, %f357, %f365; max.f32 %f470, %f469, %f420; add.f32 %f471, %f358, %f366; max.f32 %f173, %f471, %f470; add.f32 %f472, %f357, %f367; max.f32 %f473, %f472, %f424; add.f32 %f474, %f358, %f368; max.f32 %f174, %f474, %f473; add.f32 %f475, %f359, %f365; max.f32 %f476, %f475, %f428; add.f32 %f477, %f360, %f366; max.f32 %f175, %f477, %f476; add.f32 %f478, %f359, %f367; max.f32 %f479, %f478, %f432; add.f32 %f480, %f360, %f368; max.f32 %f176, %f480, %f479; setp.eq.s64 %p1, %rd382, 96; @%p1 bra LBB0_3; bra.uni LBB0_5; LBB0_3: // %L11515.3 // in Loop: Header=BB0_2 Depth=2 setp.gt.u64 %p2, %rd379, 1983; bar.sync 0; st.shared.v4.f32 [%rd33], {%f649, %f650, %f651, %f652}; st.shared.v4.f32 [%rd34], {%f653, %f654, %f655, %f656}; st.shared.v4.f32 [%rd35], {%f641, %f642, %f643, %f644}; st.shared.v4.f32 [%rd36], {%f645, %f646, %f647, %f648}; bar.sync 0; @%p2 bra LBB0_5; // %bb.4: // %L14475.3 // in Loop: Header=BB0_2 Depth=2 ld.global.v4.f32 {%f649, %f650, %f651, %f652}, [%rd48]; ld.global.v4.f32 {%f653, %f654, %f655, %f656}, [%rd49]; ld.global.v4.f32 {%f641, %f642, %f643, %f644}, [%rd50]; ld.global.v4.f32 {%f645, %f646, %f647, %f648}, [%rd51]; bra.uni LBB0_5; LBB0_7: // %L18292.L18298_crit_edge st.shared.f32 [%rd12+-272], %f593; st.shared.f32 [%rd13+-272], %f595; st.shared.f32 [%rd14+-272], %f594; st.shared.f32 [%rd15+-272], %f596; st.shared.f32 [%rd16+-272], %f601; st.shared.f32 [%rd17+-272], %f603; st.shared.f32 [%rd18+-272], %f602; st.shared.f32 [%rd19+-272], %f604; st.shared.f32 [%rd20+16], %f597; st.shared.f32 [%rd21+16], %f599; st.shared.f32 [%rd22+2048], %f598; st.shared.f32 [%rd22+4096], %f600; st.shared.f32 [%rd23+16], %f605; st.shared.f32 [%rd24+16], %f607; st.shared.f32 [%rd25+2048], %f606; st.shared.f32 [%rd25+4096], %f608; bar.sync 0; max.s64 %rd348, %rd4, 0; mul.lo.s64 %rd349, %rd6, %rd348; add.s64 %rd350, %rd7, %rd349; ld.shared.v4.f32 {%f529, %f530, %f531, %f532}, [%rd8]; shr.s64 %rd351, %rd350, 63; shr.u64 %rd352, %rd351, 62; add.s64 %rd353, %rd350, %rd352; shl.b64 %rd354, %rd353, 2; and.b64 %rd355, %rd354, -16; add.s64 %rd356, %rd3, %rd355; st.global.v4.f32 [%rd356], {%f529, %f530, %f531, %f532}; shl.b64 %rd357, %rd348, 4; add.s64 %rd358, %rd350, %rd357; ld.shared.v4.f32 {%f533, %f534, %f535, %f536}, [%rd9]; shr.s64 %rd359, %rd358, 63; shr.u64 %rd360, %rd359, 62; add.s64 %rd361, %rd358, %rd360; shl.b64 %rd362, %rd361, 2; and.b64 %rd363, %rd362, -16; add.s64 %rd364, %rd3, %rd363; st.global.v4.f32 [%rd364], {%f533, %f534, %f535, %f536}; add.s64 %rd365, %rd358, %rd357; ld.shared.v4.f32 {%f537, %f538, %f539, %f540}, [%rd10]; shr.s64 %rd366, %rd365, 63; shr.u64 %rd367, %rd366, 62; add.s64 %rd368, %rd365, %rd367; shl.b64 %rd369, %rd368, 2; and.b64 %rd370, %rd369, -16; add.s64 %rd371, %rd3, %rd370; st.global.v4.f32 [%rd371], {%f537, %f538, %f539, %f540}; add.s64 %rd372, %rd365, %rd357; ld.shared.v4.f32 {%f541, %f542, %f543, %f544}, [%rd11]; shr.s64 %rd373, %rd372, 63; shr.u64 %rd374, %rd373, 62; add.s64 %rd375, %rd372, %rd374; shl.b64 %rd376, %rd375, 2; and.b64 %rd377, %rd376, -16; add.s64 %rd378, %rd3, %rd377; st.global.v4.f32 [%rd378], {%f541, %f542, %f543, %f544}; ret; // -- End function