Skip to content
This repository has been archived by the owner on Jan 29, 2025. It is now read-only.

[hlsl-out] Misalligned Float Loads in Push Constants #2095

Closed
cwfitzgerald opened this issue Oct 20, 2022 · 3 comments · Fixed by #2096
Closed

[hlsl-out] Misalligned Float Loads in Push Constants #2095

cwfitzgerald opened this issue Oct 20, 2022 · 3 comments · Fixed by #2096
Labels
area: back-end Outputs of shader conversion lang: HLSL High-Level Shading Language resolution: invalid This doesn't seem right

Comments

@cwfitzgerald
Copy link
Member

cwfitzgerald commented Oct 20, 2022

Very similar to #2093, but likely an unrelated cause.

Found as part of gfx-rs/wgpu#3125.

When loading individual values from any matrix in a push constant, all the offsets loaded are weird. Only when loading individual values is it a problem, not when loading the entire matrix.

Here are the expected vs actual offsets when loading individual floats from matrices.

Actual [0, 4, 2, 6]. Expected [0, 1, 2, 3]. Test ["mat2x2<f32>"]
Actual [0, 4, 8, 4, 8, 12]. Expected [0, 1, 2, 4, 5, 6]. Test ["mat2x3<f32>"]
Actual [0, 4, 8, 12, 4, 8, 12, 16]. Expected [0, 1, 2, 3, 4, 5, 6, 7]. Test ["mat2x4<f32>"]
Actual [0, 4, 2, 6, 4, 8]. Expected [0, 1, 2, 3, 4, 5]. Test ["mat3x2<f32>"]
Actual [0, 4, 8, 4, 8, 12, 8, 12, 16]. Expected [0, 1, 2, 4, 5, 6, 8, 9, 10]. Test ["mat3x3<f32>"]
Actual [0, 4, 8, 12, 4, 8, 12, 16, 8, 12, 16, 20]. Expected [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]. Test ["mat3x4<f32>"]
Actual [0, 4, 2, 6, 4, 8, 6, 10]. Expected [0, 1, 2, 3, 4, 5, 6, 7]. Test ["mat4x2<f32>"]
Actual [0, 4, 8, 4, 8, 12, 8, 12, 16, 12, 16, 20]. Expected [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]. Test ["mat4x3<f32>"]
Actual [0, 4, 8, 12, 4, 8, 12, 16, 8, 12, 16, 20, 12, 16, 20, 24]. Expected [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]. Test ["mat4x4<f32>"]

To give a concrete test case, here are the inputs and outputs for two of the loading styles for push constants.

mat2x2

struct InputStruct {
    member_0: mat2x2<f32>,
}

var<push_constant> input: InputStruct;

@group(0) @binding(1)
var<storage, read_write> output: array<f32>;

@compute @workgroup_size(1)
fn cs_main() {
    let loaded = input;
    var i = 0u;
    output[i] = input.member_0[0].x;
    i += 1u;
    output[i] = input.member_0[0].y;
    i += 1u;
    output[i] = input.member_0[1].x;
    i += 1u;
    output[i] = input.member_0[1].y;
    i += 1u;
}
struct NagaConstants {
    int base_vertex;
    int base_instance;
    uint other;
};
ConstantBuffer<NagaConstants> _NagaConstants: register(b1, space1);

struct InputStruct {
    float2 member_0__0; float2 member_0__1;
};

ConstantBuffer<InputStruct> input: register(b0);
RWByteAddressBuffer output : register(u0);

float2x2 GetMatmember_0_OnInputStruct(InputStruct obj) {
    return float2x2(obj.member_0__0, obj.member_0__1);
}

void SetMatmember_0_OnInputStruct(InputStruct obj, float2x2 mat) {
    obj.member_0__0 = mat[0];
    obj.member_0__1 = mat[1];
}

void SetMatVecmember_0_OnInputStruct(InputStruct obj, float2 vec, uint mat_idx) {
    switch(mat_idx) {
    case 0: { obj.member_0__0 = vec; break; }
    case 1: { obj.member_0__1 = vec; break; }
    }
}

void SetMatScalarmember_0_OnInputStruct(InputStruct obj, float scalar, uint mat_idx, uint vec_idx) {
    switch(mat_idx) {
    case 0: { obj.member_0__0[vec_idx] = scalar; break; }
    case 1: { obj.member_0__1[vec_idx] = scalar; break; }
    }
}

[numthreads(1, 1, 1)]
void cs_main()
{
    uint i = 0u;

    InputStruct loaded = input;
    uint _expr5 = i;
    float _expr11 = GetMatmember_0_OnInputStruct(input)[0][0];
    output.Store(_expr5*4, asuint(_expr11));
    uint _expr12 = i;
    i = (_expr12 + 1u);
    uint _expr15 = i;
    float _expr21 = GetMatmember_0_OnInputStruct(input)[0][1];
    output.Store(_expr15*4, asuint(_expr21));
    uint _expr22 = i;
    i = (_expr22 + 1u);
    uint _expr25 = i;
    float _expr31 = GetMatmember_0_OnInputStruct(input)[1][0];
    output.Store(_expr25*4, asuint(_expr31));
    uint _expr32 = i;
    i = (_expr32 + 1u);
    uint _expr35 = i;
    float _expr41 = GetMatmember_0_OnInputStruct(input)[1][1];
    output.Store(_expr35*4, asuint(_expr41));
    uint _expr42 = i;
    i = (_expr42 + 1u);
    return;
}

mat2x3

struct InputStruct {
    member_0: mat2x3<f32>,
}

var<push_constant> input: InputStruct;

@group(0) @binding(1)
var<storage, read_write> output: array<f32>;

@compute @workgroup_size(1)
fn cs_main() {
    let loaded = input;
    var i = 0u;
    output[i] = input.member_0[0].x;
    i += 1u;
    output[i] = input.member_0[0].y;
    i += 1u;
    output[i] = input.member_0[0].z;
    i += 1u;
    output[i] = input.member_0[1].x;
    i += 1u;
    output[i] = input.member_0[1].y;
    i += 1u;
    output[i] = input.member_0[1].z;
    i += 1u;
}
struct NagaConstants {
    int base_vertex;
    int base_instance;
    uint other;
};
ConstantBuffer<NagaConstants> _NagaConstants: register(b1, space1);

struct InputStruct {
    row_major float2x3 member_0_;
    int _end_pad_0;
};

ConstantBuffer<InputStruct> input: register(b0);
RWByteAddressBuffer output : register(u0);

[numthreads(1, 1, 1)]
void cs_main()
{
    uint i = 0u;

    InputStruct loaded = input;
    uint _expr5 = i;
    float _expr11 = input.member_0_[0][0];
    output.Store(_expr5*4, asuint(_expr11));
    uint _expr12 = i;
    i = (_expr12 + 1u);
    uint _expr15 = i;
    float _expr21 = input.member_0_[0][1];
    output.Store(_expr15*4, asuint(_expr21));
    uint _expr22 = i;
    i = (_expr22 + 1u);
    uint _expr25 = i;
    float _expr31 = input.member_0_[0][2];
    output.Store(_expr25*4, asuint(_expr31));
    uint _expr32 = i;
    i = (_expr32 + 1u);
    uint _expr35 = i;
    float _expr41 = input.member_0_[1][0];
    output.Store(_expr35*4, asuint(_expr41));
    uint _expr42 = i;
    i = (_expr42 + 1u);
    uint _expr45 = i;
    float _expr51 = input.member_0_[1][1];
    output.Store(_expr45*4, asuint(_expr51));
    uint _expr52 = i;
    i = (_expr52 + 1u);
    uint _expr55 = i;
    float _expr61 = input.member_0_[1][2];
    output.Store(_expr55*4, asuint(_expr61));
    uint _expr62 = i;
    i = (_expr62 + 1u);
    return;
}
@cwfitzgerald cwfitzgerald added kind: bug Something isn't working area: back-end Outputs of shader conversion lang: HLSL High-Level Shading Language labels Oct 20, 2022
@cwfitzgerald
Copy link
Member Author

cwfitzgerald commented Oct 20, 2022

Alright, I'm getting FXC bug vibes from this.

AMD and Nvidia give different answers from Intel, and WARP segfaults.

FXC generates some DXBC which looks totally wacky and not remotely sane https://shader-playground.timjones.io/0cc29a22f8b96926ba3ad2cbb15bcbd3

cs_5_1
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[0:0][1], immediateIndexed, space=0
dcl_uav_raw U0[0:0], space=0
dcl_temps 1
dcl_thread_group 1, 1, 1
mov r0.xz, CB0[0][0].xxzx
mov r0.yw, CB0[1][0].xxxz
store_raw U0[0].xyzw, l(0), r0.xyzw
ret

Compare that to the generated DXIL which looks sane, loading [0, 1, 2, 3]:

define void @cs_main() {
  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
  %2 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
  %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 13, i32 16 })  ; AnnotateHandle(res,props)  resource: CBuffer
  %4 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %3, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %5 = extractvalue %dx.types.CBufRet.f32 %4, 0
  %6 = bitcast float %5 to i32
  %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
  call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %7, i32 0, i32 undef, i32 %6, i32 undef, i32 undef, i32 undef, i8 1, i32 4)  ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment)
  %8 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %3, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %9 = extractvalue %dx.types.CBufRet.f32 %8, 1
  %10 = bitcast float %9 to i32
  %11 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
  call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %11, i32 4, i32 undef, i32 %10, i32 undef, i32 undef, i32 undef, i8 1, i32 4)  ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment)
  %12 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %3, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %13 = extractvalue %dx.types.CBufRet.f32 %12, 2
  %14 = bitcast float %13 to i32
  %15 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
  call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %15, i32 8, i32 undef, i32 %14, i32 undef, i32 undef, i32 undef, i8 1, i32 4)  ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment)
  %16 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %3, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %17 = extractvalue %dx.types.CBufRet.f32 %16, 3
  %18 = bitcast float %17 to i32
  %19 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
  call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %19, i32 12, i32 undef, i32 %18, i32 undef, i32 undef, i32 undef, i8 1, i32 4)  ; RawBufferStore(uav,index,elementOffset,value0,value1,value2,value3,mask,alignment)
  ret void
}

@cwfitzgerald cwfitzgerald added resolution: invalid This doesn't seem right and removed kind: bug Something isn't working labels Oct 20, 2022
@cwfitzgerald
Copy link
Member Author

Further argument for gfx-rs/wgpu#2722

@cwfitzgerald
Copy link
Member Author

The issue appears to be double indexing the matrix like [0][0] as the push constant is looking through a ValuePointer. FXC handles the code fine if using the [0].x syntax.

cwfitzgerald added a commit to cwfitzgerald/naga that referenced this issue Oct 20, 2022
cwfitzgerald added a commit to cwfitzgerald/naga that referenced this issue Oct 20, 2022
cwfitzgerald added a commit to cwfitzgerald/naga that referenced this issue Oct 20, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area: back-end Outputs of shader conversion lang: HLSL High-Level Shading Language resolution: invalid This doesn't seem right
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant