Expose AVX512F embedded rounding intrinsics. #97415

Ruihan-Yin · 2024-01-23T20:27:19Z

Follow-up of #94684.

Updates on 02/14/24:

This PR exposes embedded rounding enabled APIs in AVX512:

The following are from #73604:

public partial class Avx512F
{
    public static Vector512<double> Add(Vector512<double> left, Vector512<double> right, FloatRoundingMode mode);
    public static Vector512<float> Add(Vector512<float> left, Vector512<float> right, FloatRoundingMode mode);

    public static Vector128<double> AddScalar(Vector128<double> left, Vector128<double> right, FloatRoundingMode mode);
    public static Vector128<float> AddScalar(Vector128<float> left, Vector128<float> right, FloatRoundingMode mode);

    public static Vector128<float> ConvertScalarToVector128Single(Vector128<float> upper, Vector128<double> value, FloatRoundingMode mode);
    public static Vector128<double> ConvertScalarToVector128Double(Vector128<double> upper, Vector128<float> value, FloatRoundingMode mode);

    public static int ConvertToInt32(Vector128<double> value, FloatRoundingMode mode);
    public static int ConvertToInt32(Vector128<float> value, FloatRoundingMode mode);

    public static Vector256<int> ConvertToVector256Int32(Vector512<double> value, FloatRoundingMode mode);
    public static Vector512<float> ConvertToVector512Single(Vector512<int> value, FloatRoundingMode mode);
    public static Vector256<float> ConvertToVector256Single(Vector512<double> value, FloatRoundingMode mode);

    public static Vector512<int> ConvertToVector512Int32(Vector512<float> value, FloatRoundingMode mode);

    public static Vector128<float> ConvertScalarToVector128Single(Vector128<float> upper, int value, FloatRoundingMode mode);

    public static Vector512<double> Divide(Vector512<double> left, Vector512<double> right, FloatRoundingMode mode);
    public static Vector512<float> Divide(Vector512<float> left, Vector512<float> right, FloatRoundingMode mode);

    public static Vector128<double> DivideScalar(Vector128<double> left, Vector128<double> right, FloatRoundingMode mode);
    public static Vector128<float> DivideScalar(Vector128<float> left, Vector128<float> right, FloatRoundingMode mode);


    public static Vector512<double> Multiply(Vector512<double> left, Vector512<double> right, FloatRoundingMode mode);
    public static Vector512<float> Multiply(Vector512<float> left, Vector512<float> right, FloatRoundingMode mode);

    public static Vector128<double> MultiplyScalar(Vector128<double> left, Vector128<double> right, FloatRoundingMode mode);
    public static Vector128<float> MultiplyScalar(Vector128<float> left, Vector128<float> right, FloatRoundingMode mode);

    public static Vector512<double> Sqrt(Vector512<double> value, FloatRoundingMode mode);
    public static Vector512<float> Sqrt(Vector512<float> value, FloatRoundingMode mode);

    public static Vector128<double> SqrtScalar(Vector128<double> upper, Vector128<double> value, FloatRoundingMode mode);
    public static Vector128<float> SqrtScalar(Vector128<float> upper, Vector128<float> value, FloatRoundingMode mode);

    public static Vector512<double> Subtract(Vector512<double> left, Vector512<double> right, FloatRoundingMode mode);
    public static Vector512<float> Subtract(Vector512<float> left, Vector512<float> right, FloatRoundingMode mode);

    public static Vector128<double> SubtractScalar(Vector128<double> left, Vector128<double> right, FloatRoundingMode mode);
    public static Vector128<float> SubtractScalar(Vector128<float> left, Vector128<float> right, FloatRoundingMode mode);

    // AVX512

    public static uint ConvertToUInt32(Vector128<double> value, FloatRoundingMode mode);
    public static uint ConvertToUInt32(Vector128<float> value, FloatRoundingMode mode);

    public static Vector256<uint> ConvertToVector256UInt32(Vector512<double> value, FloatRoundingMode mode);
    public static Vector512<uint> ConvertToVector512UInt32(Vector512<float> value, FloatRoundingMode mode);

    public partial class X64
    {
        public static long ConvertToInt64(Vector128<double> value, FloatRoundingMode mode);
        public static long ConvertToInt64(Vector128<float> value, FloatRoundingMode mode);

        public static Vector128<double> ConvertScalarToVector128Double(Vector128<double> upper, long value, FloatRoundingMode mode);
        public static Vector128<float> ConvertScalarToVector128Single(Vector128<float> upper, long value, FloatRoundingMode mode);

        // AVX512

        public static ulong ConvertToUInt64(Vector128<double> value, FloatRoundingMode mode);
        public static ulong ConvertToUInt64(Vector128<float> value, FloatRoundingMode mode);

        public static Vector128<double> ConvertScalarToVector128Double(Vector128<double> upper, ulong value, FloatRoundingMode mode);
        public static Vector128<float> ConvertScalarToVector128Single(Vector128<float> upper, ulong value, FloatRoundingMode mode);
    }
}

The following were suggested by the reviewer, taken from AVX512 surfaces tracked by other issues:

public partial class Avx512F
{
    public static Vector512<double> FusedMultiplyAdd (Vector512<double> a, Vector512<double> b, Vector512<double> c, FloatRoundingMode mode);
    public static Vector512<float> FusedMultiplyAdd (Vector512<float> a, Vector512<float> b, Vector512<float> c, FloatRoundingMode mode);
    public static Vector512<double> FusedMultiplyAddNegated (Vector512<double> a, Vector512<double> b, Vector512<double> c, FloatRoundingMode mode);
    public static Vector512<float> FusedMultiplyAddNegated (Vector512<float> a, Vector512<float> b, Vector512<float> c, FloatRoundingMode mode);
    public static Vector512<double> FusedMultiplySubtract (Vector512<double> a, Vector512<double> b, Vector512<double> c, FloatRoundingMode mode);
    public static Vector512<float> FusedMultiplySubtract (Vector512<float> a, Vector512<float> b, Vector512<float> c, FloatRoundingMode mode);
    public static Vector512<double> FusedMultiplySubtractAdd (Vector512<double> a, Vector512<double> b, Vector512<double> c, FloatRoundingMode mode);
    public static Vector512<float> FusedMultiplySubtractAdd (Vector512<float> a, Vector512<float> b, Vector512<float> c, FloatRoundingMode mode);
    public static Vector512<double> FusedMultiplySubtractNegated (Vector512<double> a, Vector512<double> b, Vector512<double> c, FloatRoundingMode mode);
    public static Vector512<float> FusedMultiplySubtractNegated (Vector512<float> a, Vector512<float> b, Vector512<float> c, FloatRoundingMode mode);
    public static Vector128<double> FusedMultiplyAddScalar (Vector128<double> a, Vector128<double> b, Vector128<double> c, FloatRoundingMode mode);
    public static Vector128<float> FusedMultiplyAddScalar (Vector128<float> a, Vector128<float> b, Vector128<float> c, FloatRoundingMode mode);
    public static Vector128<double> FusedMultiplyAddNegatedScalar (Vector128<double> a, Vector128<double> b, Vector128<double> c, FloatRoundingMode mode);
    public static Vector128<float> FusedMultiplyAddNegatedScalar (Vector128<float> a, Vector128<float> b, Vector128<float> c, FloatRoundingMode mode);
    public static Vector128<double> FusedMultiplySubtractScalar (Vector128<double> a, Vector128<double> b, Vector128<double> c, FloatRoundingMode mode);
    public static Vector128<float> FusedMultiplySubtractScalar (Vector128<float> a, Vector128<float> b, Vector128<float> c, FloatRoundingMode mode);
    public static Vector128<double> FusedMultiplySubtractNegatedScalar (Vector128<double> a, Vector128<double> b, Vector128<double> c, FloatRoundingMode mode);
    public static Vector128<float> FusedMultiplySubtractNegatedScalar (Vector128<float> a, Vector128<float> b, Vector128<float> c, FloatRoundingMode mode);
    public static Vector128<double> ScaleScalar (Vector128<double> left, Vector128<double> right, FloatRoundingMode mode);
    public static Vector128<float> ScaleScalar (Vector128<float> left, Vector128<float> right, FloatRoundingMode mode);
}
public partial class Avx512DQ
{
    public static Vector256<float> ConvertToVector256Single (Vector512<long> value, FloatRoundingMode mode);
    public static Vector256<float> ConvertToVector256Single (Vector512<ulong> value, FloatRoundingMode mode);
    public static Vector512<double> ConvertToVector512Double (Vector512<long> value, FloatRoundingMode mode);
    public static Vector512<double> ConvertToVector512Double (Vector512<ulong> value, FloatRoundingMode mode);
    public static Vector512<long> ConvertToVector512Int64 (Vector512<double> value, FloatRoundingMode mode);
    public static Vector512<long> ConvertToVector512Int64 (Vector256<float> value, FloatRoundingMode mode);
    public static Vector512<ulong> ConvertToVector512UInt64 (Vector512<double> value, FloatRoundingMode mode);
    public static Vector512<ulong> ConvertToVector512UInt64 (Vector256<float> value, FloatRoundingMode mode);
}

dotnet-issue-labeler · 2024-01-23T20:27:27Z

Note regarding the new-api-needs-documentation label:

This serves as a reminder for when your PR is modifying a ref *.cs file and adding/modifying public APIs, please make sure the API implementation in the src *.cs file is documented with triple slash comments, so the PR reviewers can sign off that change.

ghost · 2024-01-23T20:27:51Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Follow-up of #94684, still WIP, no need to review at the moment.

Author:	Ruihan-Yin
Assignees:	-
Labels:	`area-CodeGen-coreclr`, `new-api-needs-documentation`, `community-contribution`
Milestone:	-

ryujit-bot · 2024-01-24T15:09:49Z

Diff results for #97415

Throughput diffs

Throughput diffs for linux/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.crossgen2.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

Details here

ryujit-bot · 2024-01-25T00:16:12Z

Diff results for #97415

Throughput diffs

Throughput diffs for linux/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.crossgen2.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

Details here

Ruihan-Yin · 2024-01-25T01:09:36Z

Failures look unrelated, PR is ready for review.

ryujit-bot · 2024-01-29T22:41:37Z

Diff results for #97415

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	+0.01%

Throughput diffs for linux/x64 ran on windows/x64

MinOpts (0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.windows.arm64.checked.mch	+0.01%

Throughput diffs for windows/x64 ran on windows/x64

MinOpts (0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%

Details here

ryujit-bot · 2024-01-29T23:41:50Z

Diff results for #97415

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	+0.01%

Throughput diffs for linux/x64 ran on windows/x64

MinOpts (0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.windows.arm64.checked.mch	+0.01%

Throughput diffs for windows/x64 ran on windows/x64

MinOpts (0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%

Details here

ryujit-bot · 2024-01-30T20:44:18Z

Diff results for #97415

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	-0.01%

Throughput diffs for linux/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

Details here

Ruihan-Yin · 2024-01-30T21:36:15Z

Applied the format patch. All failures are known.

ryujit-bot · 2024-01-30T21:44:39Z

Diff results for #97415

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	-0.01%

Throughput diffs for linux/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/x64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

Details here

tannergooding · 2024-02-01T17:45:34Z

Updated the branch to pick up some CI fixes that have gone in.

src/coreclr/jit/hwintrinsic.h

src/coreclr/jit/codegen.h

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Ruihan-Yin · 2024-02-01T18:02:29Z

Looks like there are a few new updates, I will rebase my local branch and resolve the build error.

I think I forgot to mention that there are a few APIs whose underlying intrinsics only support Suppress All Exceptions (sae).

For example, I provided one in the commit:

/// __m128d _mm_cvt_roundss_sd (__m128d a, __m128 b, int sae)
///   VCVTSS2SD xmm1, xmm2, xmm3 {sae}
public static Vector128<double> ConvertScalarToVector128Double(Vector128<double> upper, Vector128<float> value, [ConstantExpected(Max = FloatRoundingMode.ToZero)] FloatRoundingMode mode) => ConvertScalarToVector128Double(upper, value, mode);

There are some more I haven't exposed, because the sae parameter acts more like an on-off switch. I am not sure how we are going to deal with them.

1. remove some redundant checks on embedded rounding intrinsics

pass the correct operand GenTree node, when emitting the fallback for embedded rounding intrinsics.

2. Added FMA intrinsics with embedded rounding and unit tests.

ryujit-bot · 2024-02-14T01:14:54Z

Diff results for #97415

Throughput diffs

Throughput diffs for osx/arm64 ran on linux/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.osx.arm64.checked.mch	-0.01%

Details here

ryujit-bot · 2024-02-14T02:15:06Z

Diff results for #97415

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Diffs are based on 2,531,978 contexts (984,938 MinOpts, 1,547,040 FullOpts).

MISSED contexts: 1 (0.00%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	406,618,438	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	127,431,919	+8

Assembly diffs for windows/x64 ran on windows/x64

Diffs are based on 2,804,171 contexts (1,155,877 MinOpts, 1,648,294 FullOpts).

MISSED contexts: 3,198 (0.11%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	439,649,126	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	135,055,052	+8

Details here

Assembly diffs for windows/x86 ran on windows/x86

Diffs are based on 2,599,926 contexts (1,005,474 MinOpts, 1,594,452 FullOpts).

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	348,964,431	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	113,859,455	+8

Details here

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	-0.01%

Throughput diffs for linux/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for osx/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.osx.arm64.checked.mch	-0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.windows.arm64.checked.mch	-0.01%

Throughput diffs for windows/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

Details here

ryujit-bot · 2024-02-14T03:15:21Z

Diff results for #97415

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Diffs are based on 2,531,978 contexts (984,938 MinOpts, 1,547,040 FullOpts).

MISSED contexts: 1 (0.00%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	406,618,438	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	127,431,919	+8

Assembly diffs for windows/x64 ran on windows/x64

Diffs are based on 2,804,171 contexts (1,155,877 MinOpts, 1,648,294 FullOpts).

MISSED contexts: 3,198 (0.11%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	439,649,126	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	135,055,052	+8

Details here

Assembly diffs for windows/x86 ran on windows/x86

Diffs are based on 2,599,926 contexts (1,005,474 MinOpts, 1,594,452 FullOpts).

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	348,964,431	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	113,859,455	+8

Details here

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	-0.01%

Throughput diffs for linux/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for osx/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.osx.arm64.checked.mch	-0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
libraries.pmi.windows.arm64.checked.mch	-0.01%

Throughput diffs for windows/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

Details here

Ruihan-Yin · 2024-02-14T17:20:36Z

Failures look unrelated, will apply the format patch, ready to continue the review.

ryujit-bot · 2024-02-14T19:54:24Z

Diff results for #97415

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Diffs are based on 2,531,978 contexts (984,938 MinOpts, 1,547,040 FullOpts).

MISSED contexts: 1 (0.00%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	406,611,323	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	127,424,804	+8

Assembly diffs for windows/x64 ran on windows/x64

Diffs are based on 2,821,026 contexts (1,163,479 MinOpts, 1,657,547 FullOpts).

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	439,646,022	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	135,051,948	+8

Details here

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.linux.arm64.checked.mch	+0.01%

Throughput diffs for linux/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.01% to +0.01%)

Collection	PDIFF
libraries.pmi.windows.arm64.checked.mch	+0.01%
realworld.run.windows.arm64.checked.mch	-0.01%

Throughput diffs for windows/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

Details here

ryujit-bot · 2024-02-14T20:54:32Z

Diff results for #97415

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Diffs are based on 2,531,978 contexts (984,938 MinOpts, 1,547,040 FullOpts).

MISSED contexts: 1 (0.00%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	406,611,323	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	127,424,804	+8

Assembly diffs for windows/x64 ran on windows/x64

Diffs are based on 2,821,026 contexts (1,163,479 MinOpts, 1,657,547 FullOpts).

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	439,646,022	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	135,051,948	+8

Details here

Assembly diffs for windows/x86 ran on windows/x86

Diffs are based on 2,599,259 contexts (1,005,474 MinOpts, 1,593,785 FullOpts).

MISSED contexts: 667 (0.03%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	348,866,853	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	113,761,877	+8

Details here

tannergooding · 2024-02-14T21:10:23Z

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

+            // For FMA intrinsics, since it is not possible to get any contained operand in this case: embedded rounding
+            // is limited in register-to-register form, and the control byte is dynamic, we don't need to do any swap.


Not quite sure I follow this.

I'd have expected we still want to do instruction selection based on the target register to avoid having to insert the additional movaps. This ensures a last use parameter can still be chosen, regardless of whether it's op1, op2, or op3.

I'm fine with this being handled in a follow up PR, so we can get this merged.

But I do think we need to ensure it's handled so we get optimal codegen.

It may actually be sufficient to just call genFMAIntrinsic here instead of genHWIntrinsic_R_R_R_RM (or rather to split out the core of it so we don't call genProduceReg too many times).

I might misunderstand the original emit path for FMA, let me check if this would be a quick fix, thanks for pointing out.

The same general thing might apply to the other paths, where it'd be "more ideal" if we could call genAvxFamily for things like NI_AVX512F_ConvertToVector256Int32, to ensure we aren't missing handling.

The only reason we "can't" today is because they force the genProduceReg call, but that could be extracted or conditioned in a way to make that safe (maybe just if (instOptions == NONE) { genProduceReg(..); } or similar).

Still fine with it being handled in a separate PR, as what's here isn't incorrect, just potentially less optimal long term

tannergooding · 2024-02-14T21:17:13Z

src/coreclr/jit/lsraxarch.cpp

@@ -2506,6 +2509,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
                srcCount += BuildDelayFreeUses(emitOp2, emitOp1);
                srcCount += emitOp3->isContained() ? BuildOperandUses(emitOp3) : BuildDelayFreeUses(emitOp3, emitOp1);

+                if (intrinsicTree->OperIsEmbRoundingEnabled() && !intrinsicTree->Op(4)->IsCnsIntOrI())
+                {
+                    srcCount += BuildDelayFreeUses(intrinsicTree->Op(4), emitOp1);


I don't think we need BuildDelayFreeUses here. op(4) is an integer and so will go in a GPR. op1 is a vector and so will go in a SIMD register

I have limited understanding of the LSRA domain; it would be much appreciated if you could point me to the appropriate function to use.

Like BuildOperandUse might be the correct one?

For this case you should just need BuildOperandUses(intrinsicTree->Op(4))

The general premise is:

BuildOperandUses -- safe to use when there aren't any restrictions

BuildDelayFreeUses -- safe to use when you need to restrict the register set, typically used for RMW nodes

In this case, FMA is an RMW node. However, the register sets used by op4 (general-purpose) and op1 (simd) don't overlap, so we don't need to concern ourselves with op4 being delay free.

Thanks for the explanation!

tannergooding

This needs secondary sign-off from someone from @dotnet/jit-contrib

BruceForstall · 2024-02-14T23:15:02Z

@Ruihan-Yin The top comment here says "Follow-up of #94684, still WIP, no need to review at the moment.". Can you please update this to describe the contents of the PR?

Ruihan-Yin · 2024-02-14T23:34:48Z

@Ruihan-Yin The top comment here says "Follow-up of #94684, still WIP, no need to review at the moment.". Can you please update this to describe the contents of the PR?

Done, thanks for the notice.

ryujit-bot · 2024-02-15T02:55:01Z

Diff results for #97415

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Diffs are based on 2,531,978 contexts (984,938 MinOpts, 1,547,040 FullOpts).

MISSED contexts: 1 (0.00%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	406,611,323	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.linux.x64.checked.mch	127,424,804	+8

Assembly diffs for windows/x64 ran on windows/x64

Diffs are based on 2,821,026 contexts (1,163,479 MinOpts, 1,657,547 FullOpts).

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	439,646,022	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x64.checked.mch	135,051,948	+8

Details here

Assembly diffs for windows/x86 ran on windows/x86

Diffs are based on 2,599,259 contexts (1,005,474 MinOpts, 1,593,785 FullOpts).

MISSED contexts: 667 (0.03%)

Overall (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	348,866,853	+8

FullOpts (+8 bytes)

Collection	Base size (bytes)	Diff size (bytes)
coreclr_tests.run.windows.x86.checked.mch	113,761,877	+8

Details here

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
realworld.run.linux.arm64.checked.mch	-0.01%

Throughput diffs for linux/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.x64.checked.mch	+0.01%
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.linux.x64.checked.mch	+0.01%
benchmarks.run_tiered.linux.x64.checked.mch	+0.01%
coreclr_tests.run.linux.x64.checked.mch	+0.01%
libraries.crossgen2.linux.x64.checked.mch	+0.01%
libraries_tests.run.linux.x64.Release.mch	+0.01%
realworld.run.linux.x64.checked.mch	+0.01%
smoke_tests.nativeaot.linux.x64.checked.mch	+0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.01% to +0.00%)

Collection	PDIFF
realworld.run.windows.arm64.checked.mch	-0.01%

Throughput diffs for windows/x64 ran on windows/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.x64.checked.mch	+0.01%
libraries.crossgen2.windows.x64.checked.mch	+0.01%
smoke_tests.nativeaot.windows.x64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_pgo.windows.x64.checked.mch	+0.01%

Details here

Ruihan-Yin · 2024-02-15T22:41:32Z

Thanks for all the reviews and help!

ghost added the community-contribution Indicates that the PR has been added by a community member label Jan 23, 2024

dotnet-issue-labeler bot added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI new-api-needs-documentation labels Jan 23, 2024

Ruihan-Yin force-pushed the EmbRoundingAPIs branch 2 times, most recently from dfee205 to 5c44b41 Compare January 23, 2024 21:41

This was referenced Jan 25, 2024

Tests crashing in CI with no dump: exit code 137 means SIGKILL Killed #97049

Closed

GC/API/Refresh/Refresh failing in outer loop runs #97437

Closed

Ruihan-Yin marked this pull request as ready for review January 25, 2024 01:09

Ruihan-Yin force-pushed the EmbRoundingAPIs branch from cc54966 to 5f0c7eb Compare January 29, 2024 20:48

build-analysis bot mentioned this pull request Jan 30, 2024

Failed USB connection via port 54050, error 61, in tvOS arm64 Release AllSubsets_Mono #82637

Open

build-analysis bot mentioned this pull request Jan 31, 2024

Assertion failed 'hwintrinsicChild->isContained()' in 'System.Numerics.Tensors.TensorPrimitives+ScaleBOperator #97688

Closed

tannergooding reviewed Feb 1, 2024

View reviewed changes

src/coreclr/jit/hwintrinsic.h Outdated Show resolved Hide resolved

tannergooding reviewed Feb 1, 2024

View reviewed changes

src/coreclr/jit/hwintrinsic.h Outdated Show resolved Hide resolved

tannergooding reviewed Feb 1, 2024

View reviewed changes

src/coreclr/jit/codegen.h Outdated Show resolved Hide resolved

tannergooding reviewed Feb 1, 2024

View reviewed changes

src/coreclr/jit/hwintrinsiccodegenxarch.cpp Show resolved Hide resolved

Ruihan-Yin added 6 commits February 13, 2024 13:51

resolve comments:

38191d3

1. remove some redundant checks on embedded rounding intrinsics

Bug fix:

061a8dd

pass the correct operand GenTree node, when emitting the fallback for embedded rounding intrinsics.

formatting

1367a9a

revert an unexpected change.

7de90b2

1.Resolve comments:

d0c805c

2. Added FMA intrinsics with embedded rounding and unit tests.

Expose the rest of embedded rounding APIs

fd5a2d5

Ruihan-Yin force-pushed the EmbRoundingAPIs branch from fa83b08 to fd5a2d5 Compare February 13, 2024 23:40

Ruihan-Yin requested a review from tannergooding February 14, 2024 17:20

formatting

aa13249

build-analysis bot mentioned this pull request Feb 14, 2024

[mt][browser] HttpClient_CancelInDifferentThread failing with operation cancelled #98101

Open

tannergooding reviewed Feb 14, 2024

View reviewed changes

tannergooding approved these changes Feb 14, 2024

View reviewed changes

Ensure the control byte local is assigned to the correct register.

7ee7749

This was referenced Feb 15, 2024

NuGet failing with Response status code does not indicate success: 503 (Service Unavailable) dotnet/arcade#11723

Open

System.Text.Json failing some large file tests #59678

Closed

[browser][MT] Assert failed: Cannot find Promise for JSHandle -2 #98406

Closed

BruceForstall approved these changes Feb 15, 2024

View reviewed changes

BruceForstall merged commit aeecdb8 into dotnet:main Feb 15, 2024
206 of 208 checks passed

github-actions bot locked and limited conversation to collaborators Mar 17, 2024

		// For FMA intrinsics, no operand can be contained in this case: embedded rounding is limited to the
		// register-to-register form and the control byte is dynamic, so we don't need to do any swap.

Expose AVX512F embedded rounding intrinsics. #97415

Expose AVX512F embedded rounding intrinsics. #97415

Conversation

Ruihan-Yin commented Jan 23, 2024 • edited Loading

dotnet-issue-labeler bot commented Jan 23, 2024

ghost commented Jan 23, 2024

ryujit-bot commented Jan 24, 2024

Throughput diffs

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

ryujit-bot commented Jan 25, 2024

Throughput diffs

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

Ruihan-Yin commented Jan 25, 2024

ryujit-bot commented Jan 29, 2024

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/arm64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

ryujit-bot commented Jan 29, 2024

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/arm64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

ryujit-bot commented Jan 30, 2024

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

Ruihan-Yin commented Jan 30, 2024

ryujit-bot commented Jan 30, 2024

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

tannergooding commented Feb 1, 2024

Ruihan-Yin commented Feb 1, 2024

ryujit-bot commented Feb 14, 2024

Throughput diffs

Throughput diffs for osx/arm64 ran on linux/x64

ryujit-bot commented Feb 14, 2024

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Assembly diffs for windows/x64 ran on windows/x64

Assembly diffs for windows/x86 ran on windows/x86

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for osx/arm64 ran on windows/x64

Throughput diffs for windows/arm64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

ryujit-bot commented Feb 14, 2024

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Assembly diffs for windows/x64 ran on windows/x64

Assembly diffs for windows/x86 ran on windows/x86

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for osx/arm64 ran on windows/x64

Throughput diffs for windows/arm64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

Ruihan-Yin commented Feb 14, 2024

ryujit-bot commented Feb 14, 2024

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Assembly diffs for windows/x64 ran on windows/x64

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/x64 ran on windows/x64

Throughput diffs for windows/arm64 ran on windows/x64

Throughput diffs for windows/x64 ran on windows/x64

ryujit-bot commented Feb 14, 2024

Assembly diffs

Assembly diffs for linux/x64 ran on windows/x64

Assembly diffs for windows/x64 ran on windows/x64

Assembly diffs for windows/x86 ran on windows/x86

Ruihan-Yin commented Jan 23, 2024 •

edited

Loading

tannergooding Feb 14, 2024 •

edited

Loading

Ruihan-Yin Feb 14, 2024 •

edited

Loading