diff --git a/docs/basic/performance.md b/docs/basic/performance.md index e1fff2f9b..c46a1cefc 100644 --- a/docs/basic/performance.md +++ b/docs/basic/performance.md @@ -4,30 +4,35 @@ ## 测试报告 -社区版本的HybridCLR除了数值计算跟lua持平之外,其他方面数据均大幅(数倍到数十倍)优于lua方案。 +社区版本的HybridCLR除了**数值计算**跟AOT有较明显差距外,其他方面差距不大。因此对于大多数项目来说,游戏综合性能跟全原生版本差距不大。 -**商业版本**的HybridCLR大幅优化了数值计算性能,有近300%的性能提升,其他大多数普通指令也有50%-200%的性能提升,对性能有严苛要求的开发者可以联系我们[商业化服务](../business/intro.md)。 +**商业版本**的HybridCLR大幅优化了数值计算性能,性能是社区版本的280%-735%,对性能有严苛要求的开发者可以联系我们[商业化服务](../business/intro.md)。 -以下是社区版本的HybridCLR在iphone 11及小米5C手机下的实机测试报告,测试代码附录最后。 +以下是OnePlus 9R ArmV8 实机测试报告,测试代码附录最后。 -:::caution -注意,test2、test8、test9的测试用例极不合理,AOT情况下会被编译器完全优化掉,导致时间为0。真实差距 -应该在10-30倍之间。 -::: +### AOT耗时 vs 商业化版本耗时 vs 社区版本耗时 (越小越好) -AOT 行是原生il2cpp的数据。HotFix 行是HybridCLR的数据。Lua 行是xlua的数据。 +![data](/img/benchmark/numeric_datas.jpg) -![iphone11](/img/hybridclr/benchmark_iphone11.png) +### 商业化版本耗时/AOT耗时 vs 社区版本耗时/AOT耗时 (越小越好) -![xiaomi5c](/img/hybridclr/benchmark_xiaomi.png) +AOT版本性能是社区版本的`4.1 - 90`倍,是商业化版本的`1.30 - 12.9`倍。 -以下是部分测试用例下的商业化版本相比于社区版本的性能提升数据。 +![data](/img/benchmark/numeric_business_vs_aot_div_aot.jpg) -![interpreter_optimization](/img/hybridclr/interpreter_optimization.jpg) -以下是数值计算方面AOT与HybridCLR在优化后的性能对比,加法大约是7-16倍左右,乘法是4倍,除法是2倍。 +### 商业化版本性能/社区版本性能 (越大越好) + +商业化版本性能是社区版本的`2.87-7.35`倍。 + +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) + +### 商业化版本性能/AOT版本性能 (越小越好) + +AOT版本性能是商业化版本的`1.30 - 12.9`倍。 + +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) -![benchmark_numeric](/img/hybridclr/benchmark_numeric.jpg) ## 原理 @@ -104,225 +109,779 @@ HybridCLR与il2cpp AOT部分交互极其轻量高效。不再有性能问题。 ## 附录:测试用例代码 -下面这些测试用例来自第三方提供,用例并不合理,但我们不想有刻意构造之嫌,直接引用它的用例。 - -```csharp -private static void Test0() -{ - var go = new GameObject("t"); - var transform = go.transform; - - var cnt = PerformanceSetting.Count * 1000; - for (var i = 0; i < cnt; i++) - { - transform.position = transform.position; - } - - Object.Destroy(go); -} - 
-private static void Test1() -{ - var go = new GameObject("t"); - var transform = go.transform; - - var cnt = PerformanceSetting.Count * 100; - for (var i = 0; i < cnt; i++) - { - transform.Rotate(Vector3.up, 1); - } - - Object.Destroy(go); -} - -private static void Test2() -{ - var cnt = PerformanceSetting.Count * 1000; - for (var i = 0; i < cnt; i++) - { - var v = new Vector3(i, i, i); - var x = v.x; - var y = v.y; - var z = v.z; - var r = x + y * z; - } -} -private static void Test3() -{ - var cnt = PerformanceSetting.Count * 10; - for (var i = 0; i < cnt; i++) - { - var go = new GameObject("t"); - Object.Destroy(go); - } -} -private static void Test4() -{ - var cnt = PerformanceSetting.Count * 10; - for (var i = 0; i < cnt; i++) - { - var go = new GameObject(); - go.AddComponent(); - var c = go.GetComponent(); - c.receiveShadows = false; - Object.Destroy(go); - } -} - -private static void Test5() -{ - var cnt = PerformanceSetting.Count * 1000; - for (var i = 0; i < cnt; i++) - { - var p = Input.mousePosition; - } -} - -private static void Test6() +```csharp +public class LongArithmetics { - var cnt = PerformanceSetting.Count * 1000; - for (var i = 0; i < cnt; i++) - { - var v = new Vector3(i, i, i); - Vector3.Normalize(v); - } -} - -private static void Test7() + [Benchmark] + [Params(1000000)] + public long add_1(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + + for(long i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long add_2(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + long e = 3; + + for (long i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e 
= a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long mul_1(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + + for (long i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long mul_2(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + long e = 3; + + for (long i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long div_1(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + + for (long i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b 
+ c + d; + } + + + public class IntArithmetics { - var cnt = PerformanceSetting.Count * 100; - for (var i = 0; i < cnt; i++) - { - var q1 = Quaternion.Euler(i, i, i); - var q2 = Quaternion.Euler(i * 2, i * 2, i * 2); - Quaternion.Slerp(Quaternion.identity, q1, 0.5f); - } + [Benchmark] + [Params(1000000)] + public int add_1(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + + for(int i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int add_2(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + int e = 3; + + for (int i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int mul_1(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + + for (int i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int mul_2(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + int e = 3; + + for (int i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c 
= d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int div_1(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + + for (int i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } } -private static void Test8() +public class FloatArithmetics { - double total = 0; - var cnt = PerformanceSetting.Count * 10000; - for (var i = 0; i < cnt; i++) - { - total = total + i - (i / 2) * (i + 3) / (i + 5); - } + [Benchmark] + [Params(1000000)] + public float add_1(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + + for(int i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float add_2(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + float e = 3; + + for (int i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a 
+ b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float mul_1(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + + for (int i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float mul_2(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + float e = 3; + + for (int i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float div_1(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + + for (int i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } } -private static void Test9() +public class DoubleArithmetics { - 
var cnt = PerformanceSetting.Count * 1000; - for (var i = 0; i < cnt; i++) - { - var a = new Vector3(1, 2, 3); - var b = new Vector3(4, 5, 6); - var c = a + b; - } + [Benchmark] + [Params(1000000)] + public double add_1(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + + for (int i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double add_2(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + double e = 3; + + for (int i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double mul_1(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + + for (int i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double mul_2(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + double e = 3; + + for (int i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e 
= a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double div_1(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + + for (int i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } } -``` - -```lua -local function test0() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - - local go = CS.UnityEngine.GameObject("_") - local transform = go.transform - - for i = 1, cnt do - transform.position = transform.position - end - - CS.UnityEngine.GameObject.Destroy(go) -end - -local function test1() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 100 - - local go = CS.UnityEngine.GameObject("_") - local transform = go.transform - - for i = 1, cnt do - transform:Rotate(CS.UnityEngine.Vector3.up, 1) - end - - CS.UnityEngine.GameObject.Destroy(go) -end - -local function test2() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - - local go = CS.UnityEngine.GameObject("_") - local transform = go.transform - - for i = 1, cnt do - local tmp = CS.UnityEngine.Vector3(i, i, i) - local x = tmp.x - local y = tmp.y - local z = tmp.z - local r = x + y * z - end -end - -local function test3() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 10 - 
for i = 1, cnt do - local tmp = CS.UnityEngine.GameObject("___") - CS.UnityEngine.GameObject.Destroy(tmp) - end -end - -local function test4() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 10 - for i = 1, cnt do - local tmp = CS.UnityEngine.GameObject("___") - tmp:AddComponent(typeof(CS.UnityEngine.SkinnedMeshRenderer)) - local c = tmp:GetComponent(typeof(CS.UnityEngine.SkinnedMeshRenderer)) - c.receiveShadows = false - CS.UnityEngine.GameObject.Destroy(tmp) - end -end - -local function test5() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - for i = 1, cnt do - local tmp = CS.UnityEngine.Input.mousePosition; - end -end - -local function test6() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - for i = 1, cnt do - local tmp = CS.UnityEngine.Vector3(i, i, i) - CS.UnityEngine.Vector3.Normalize(tmp) - end -end - -local function test7() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 100 - for i = 1, cnt do - local t1 = CS.UnityEngine.Quaternion.Euler(i, i, i) - local t2 = CS.UnityEngine.Quaternion.Euler(i * 2, i * 2, i * 2) - CS.UnityEngine.Quaternion.Slerp(t1, t2, CS.UnityEngine.Random.Range(0.1, 0.9)) - end -end - -local function test8() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 10000 - local total = 0 - for i = 1, cnt do - total = total + i - (i / 2) * (i + 3) / (i + 5) - end -end - -local function test9() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - for i = 1, cnt do - local tmp0 = CS.UnityEngine.Vector3(1, 2, 3) - local tmp1 = CS.UnityEngine.Vector3(4, 5, 6) - local tmp2 = tmp0 + tmp1 - end -end ``` \ No newline at end of file diff --git a/docs/business/advancedcodeoptimization.md b/docs/business/advancedoptimization.md similarity index 61% rename from docs/business/advancedcodeoptimization.md rename to docs/business/advancedoptimization.md index b5cf36996..b689252d9 100644 --- 
a/docs/business/advancedcodeoptimization.md +++ b/docs/business/advancedoptimization.md @@ -1,16 +1,21 @@ -# 高级指令优化 +# 离线指令优化 :::warning -高级指令优化尚在开发中。 + +离线指令优化技术正开发中,目前仅可使用标准指令优化技术。 ::: -高级指令优化技术是独立于标准指令优化技术的实现。高级指令优化技术使用更丰富的编译优化技术,极大提升了解释模块的性能。 +离线指令优化(Offline Instruction Optimization,简称OIO)离线将原始IL指令转换为自定义的寄存器指令。 +由于离线没有编译性能限制,可以使用更丰富的编译优化技术,极大提升了解释模块的性能。 + 优化后的指令执行性能整体提升100%-1000%(没看错,10倍以上)甚至更高,尤其是数值指令整体提升近300%。 而且由于已经提前转换,加载和指令翻译过程更快,卡顿更小。 +离线指令优化技术支持代码加固方案中的虚拟化技术,极大提升了代码安全。 + ## 实现 -高级指令优化技术包含了以下优化技术: +离线指令优化技术包含了以下优化技术: - 彻底的无用栈指令消除。消除掉所有不必要的栈操作 - 窥孔优化 @@ -28,4 +33,3 @@ ## 性能 TODO。 - diff --git a/docs/business/basiccodeoptimization.md b/docs/business/basiccodeoptimization.md deleted file mode 100644 index ed81b8889..000000000 --- a/docs/business/basiccodeoptimization.md +++ /dev/null @@ -1,50 +0,0 @@ -# 离线指令优化 - -:::warning - -离线指令优化技术正开发中,目前仅可使用旧版本指令优化技术(即原来的标准指令优化技术)。 -::: - -离线指令优化(Offline Instruction Optimization,简称OIO)离线将原始IL指令转换为自定义的寄存器指令。 -由于离线没有编译性能限制,可以使用更丰富的编译优化技术,极大提升了解释模块的性能。 - -优化后的指令执行性能整体提升100%-1000%(没看错,10倍以上)甚至更高,尤其是数值指令整体提升近300%。 -而且由于已经提前转换,加载和指令翻译过程更快,卡顿更小。 - -离线指令优化技术支持代码加固方案中的虚拟化技术,极大提升了代码安全。 - -## 实现 - -高级指令优化技术包含了以下优化技术: - -- 彻底的无用栈指令消除。消除掉所有不必要的栈操作 -- 窥孔优化 -- 常量复制优化 -- 局部复制传播优化 -- 全局复制传播优化 -- 解释函数inline -- AOT函数inline(专利技术) -- 提供更多instinct指令,大幅提升常见的指令组合性能 -- 条件检查消除技术。消除不必要的空指针检查、类型强转检查、数组越界检查 -- CheckOnce运行时检查动态消除优化。例如访问静态成员变量的指令,在第2次执行时不再检查类型是否已经初始化过 -- 其他优化 - - -## 性能 - -TODO。 - -## 标准指令优化技术(将废弃) - -标准指令优化技术为运行时优化技术。对常见的代码范式进行谨慎可靠的优化,大幅提升了变量访问(50%-100%)、数值计算(100-300%)、对象访问(50-200%)等常见指令的性能,像一些特殊代码如typeof指令的性能,提升了1000%以上。 - -由于运行时的时间及内存限制,标准指令优化只做一些无用栈消除、窥孔优化等等简单但较可靠的优化,无法执行一些复杂的优化。但由于IL指令是栈指令,即使只做了一些不复杂的常见优化,性能相比于社区的未优化版本也有显著提升。 - - -以下是标准指令优化相比于社区版本的解释性能提升数据(0表示性能持平,n表示提升n倍)。 - -![interpreter_optimization](/img/hybridclr/interpreter_optimization.jpg) - -以下是数值计算方面原生与标准指令优化的性能对比,纵坐标为耗时。标准指令优化的加法大约为原生7-16倍左右,乘法是4倍,除法是2倍。 - -![benchmark_numeric](/img/hybridclr/benchmark_numeric.jpg) diff --git a/docs/business/basicoptimization.md 
b/docs/business/basicoptimization.md new file mode 100644 index 000000000..00ab8ac85 --- /dev/null +++ b/docs/business/basicoptimization.md @@ -0,0 +1,34 @@ +# 解释执行优化 + +目前仅可使用标准解释执行优化技术,[离线指令优化](./advancedoptimization)技术还在开发中。 + +## 标准指令优化技术 + +标准指令优化技术为运行时优化技术。对常见的代码范式进行谨慎可靠的优化,大幅提升了变量访问、数值计算、对象访问等常见指令的性能,像一些特殊代码如typeof指令的性能,提升了1000%以上。 + +**商业版**大幅提升了数值计算性能,其性能是社区版本的 **280%-735%**。 + +以下是OnePlus 9R ArmV8 实机测试报告,测试代码见[性能](../basic/performance.md)文档的附录。 + +### AOT耗时 vs 商业化版本耗时 vs 社区版本耗时 (越小越好) + +![data](/img/benchmark/numeric_datas.jpg) + +### 商业化版本耗时/AOT耗时 vs 社区版本耗时/AOT耗时 (越小越好) + +AOT版本性能是社区版本的`4.1 - 90`倍,是商业化版本的`1.30 - 12.9`倍。 + +![data](/img/benchmark/numeric_business_vs_aot_div_aot.jpg) + + +### 商业化版本性能/社区版本性能 (越大越好) + +商业化版本性能是社区版本的`2.87-7.35`倍。 + +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) + +### 商业化版本性能/AOT版本性能 (越小越好) + +AOT版本性能是商业化版本的`1.30 - 12.9`倍。 + +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) diff --git a/docs/business/intro.md b/docs/business/intro.md index c0ebc3cbf..22dd17ab1 100644 --- a/docs/business/intro.md +++ b/docs/business/intro.md @@ -12,7 +12,7 @@ 目前有三个商业化版本,具体特性对比如下: -- [专业版](./pro/intro.md)。优化了性能、内存,提供更高的代码安全 +- [专业版](./pro/intro.md)。显著提升了[解释执行性能](./basicoptimization)(数值指令性能是社区版本的**280%-735%**)、优化了元数据内存,支持代码加密,有效保障了代码安全 - [**旗舰版**](./ultimate/intro.md)。包含专业版的所有功能,另外包含了我们最核心的[DHE技术](./differentialhybridexecution),极大提升了性能,几乎(**未改动时为100%**)达到同等的原生AOT水平 - [热重载版](./reload/intro.md)。包含专业版的所有功能,同时支持卸载和重新加载单独的assembly,当前可以卸载掉程序集**100%**的元数据内存 @@ -27,7 +27,7 @@ |[完全泛型共享](./fullgenericsharing)||✔|✔|✔| |[DOTS](./dots)||✔|✔|✔| |[元数据优化](./metadataoptimization.md)||✔|✔|✔| -|[离线指令优化](./basiccodeoptimization)||✔|✔|✔| +|[解释性能优化](./basicoptimization)||✔|✔|✔| |[代码加固](./basicencryption)||✔|✔|✔| |[热重载](./reload/hotreloadassembly)||||✔| |[访问控制机制](./accesspolicy)||||✔| diff --git a/i18n/en/docusaurus-plugin-content-docs/current/basic/performance.md b/i18n/en/docusaurus-plugin-content-docs/current/basic/performance.md index eba1a77cf..db85de5e6 
100644 --- a/i18n/en/docusaurus-plugin-content-docs/current/basic/performance.md +++ b/i18n/en/docusaurus-plugin-content-docs/current/basic/performance.md @@ -1,328 +1,887 @@ # Performance -Although HybridCLR is also interpreted and executed, both theoretical principles and real machine test data show that HybridCLR has greatly improved performance (several times or even dozens of times) compared to the current popular hot update solutions such as lua and ILRuntime. +Although HybridCLR is also interpreted and executed, both theoretical principles and real machine test data show that HybridCLR has greatly improved performance (several times or even dozens of times) compared to the currently popular hot update solutions such as Lua and ILRuntime. ## Benchmark -The community version of HybridCLR is much (several times to tens of times) better than the Lua solution except that its numerical calculation is equal to that of Lua. +Apart from **numerical calculation**, where there is a noticeable gap compared with AOT, the community version of HybridCLR differs little from AOT in other respects. Therefore, for most projects, the overall performance of the game is not much different from that of the fully native version. -The HybridCLR of the **commercial version** has greatly optimized the performance of numerical calculations, with a performance improvement of nearly 300%. Most other common instructions also have a performance improvement of 50%-200%. Developers who have strict requirements on performance can contact us [Business Services](../business/intro.md). +The **commercial version** of HybridCLR has greatly optimized numerical calculation performance, reaching 280%-735% of the community version. Developers with strict performance requirements can contact our [commercial service](../business/intro.md). -The following is the test report of the community version of HybridCLR under the iphone 11 and Xiaomi 5C mobile phones. The test code is at the end of the appendix. 
+The following is the OnePlus 9R ArmV8 actual test report, with the test code appendix at the end. -:::caution -Note that the test cases of test2, test8, and test9 are extremely unreasonable. In the case of AOT, they will be completely optimized by the compiler, resulting in a time of 0. real gap -It should be between 10-30 times. -::: +### AOT time-consuming vs. commercial version time-consuming vs. community version time-consuming (the smaller the better) -AOT lines are native il2cpp data. The HotFix line is HybridCLR data. Lua lines are data for xlua. +![data](/img/benchmark/numeric_datas.jpg) -![iphone11](/img/hybridclr/benchmark_iphone11.png) +### Commercial version time-consuming/AOT time-consuming vs community version time-consuming/AOT time-consuming (the smaller the better) -![xiaomi5c](/img/hybridclr/benchmark_xiaomi.png) +The performance of the AOT version is `4.1 - 90` times that of the community version and `1.30 - 12.9` times that of the commercial version. -The following is the performance improvement data of the commercial version under some test cases compared with the community version. +![data](/img/benchmark/numeric_business_vs_aot_div_aot.jpg) -![interpreter_optimization](/img/hybridclr/interpreter_optimization.jpg) -The following is the performance comparison between AOT and HybridCLR after optimization in terms of numerical calculation. The addition is about 7-16 times, the multiplication is 4 times, and the division is 2 times. +### Commercial version performance/community version performance (the bigger, the better) + +The performance of the commercial version is `2.87-7.35` times that of the community version. + +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) + +### Commercial version performance/AOT version performance (the smaller, the better) + +The performance of the AOT version is `1.30 - 12.9` times that of the commercial version. 
+ +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) -![benchmark_numeric](/img/hybridclr/benchmark_numeric.jpg) ## Principle -Since HybridCLR is implemented in C++ and seamlessly integrated directly with il2cpp runtime, it can directly access the underlying data and various interfaces of runtime. Compared with ILRuntime and Xlua, the additional cost of the C# layer is saved, and the interaction cost is greatly reduced. +Since HybridCLR is implemented in C++ and directly integrated seamlessly with the il2cpp runtime, it can directly access the underlying data and various interfaces of the runtime. Compared with ILRuntime and Xlua, the additional cost of the C# layer is eliminated, and the interaction cost is greatly reduced. The excellent performance of HybridCLR mainly comes from the following aspects: ### Rewritten streamlined and efficient metadata parsing library -We did not use the ready-made metadata parsing library, and implemented a C++ version of streamlined and efficient metadata according to the requirements of HybridCLR -parsing library. Other C# hot update or hotfix solutions use C# libraries such as Cecil, and there is a huge gap in memory and loading efficiency! +We did not use a ready-made metadata parsing library; instead, following HybridCLR's requirements, we implemented a streamlined and efficient C++ metadata +parsing library. Other C# hot update or hotfix solutions all use C# libraries such as Cecil, and there is a huge gap in memory and loading efficiency! -### Use the register instruction set +### Use register instruction set -The original IL bytecode is a stack-based instruction set, and HybridCLR converts it into a register instruction set, which reduces the stack maintenance overhead. 
### Direct access to data stack and execution stack -Stack operations are the most common operations in the CLI, and almost all instructions involve stack operations. Since the interpreter stack is the heap memory maintained by itself, the CLI has restrictions on the pointer operation of the struct. If the interpreter is implemented in C#, -You cannot directly manipulate data types on the interpreter stack, and you have to use various tricks to achieve this purpose indirectly. The HybridCLR is implemented in C++ and can be directly manipulated. +Stack operation is the most common operation in CLI, and almost all instructions involve stack operation. Since the interpreter stack is a heap memory maintained by itself, the CLI has restrictions on struct pointer operations. If you use C# to implement the interpreter, +Then you cannot directly operate the data type on the interpreter stack, and you have to use various techniques to achieve this purpose indirectly. HybridCLR is implemented in C++ and can be operated directly. -The efficiency of manipulating struct types is several times to dozens of times higher than other interpreters. +The efficiency of operating struct types is improved several times to dozens of times compared with other interpreters. -### Directive static specialization +### Static specialization of directives -Some instructions such as the `add` instruction are multi-function instructions, and the final operation is determined according to the type of the operand on the current stack. HybridCLR designed four instructions `add_i4, add_i8, add_r4, add_r8` for it, when translating instructions -Calculate the data type of the current stack and translate it into the corresponding specialization instruction. It saves the overhead of judging the type at runtime, and also saves the overhead of maintaining data types at runtime. 
+Some instructions such as the `add` instruction are multi-functional instructions that determine the final operation based on the type of operand currently on the stack. HybridCLR has designed 4 instructions for it: `add_i4, add_i8, add_r4, add_r8`. When translating the instructions, +it computes the data types on the current stack and emits the corresponding specialized instruction. This saves the overhead of determining types at runtime and the overhead of maintaining data types at runtime. ### Calculate the runtime metadata that needs to be resolved in advance -Some instructions such as ldtoken and ldstr need to convert the data in the instruction into actual runtime data at runtime. HybridCLR directly calculates the corresponding runtime data during translation and saves it in the converted instruction. -greatly improved performance +Some instructions such as ldtoken and ldstr require the data in the instruction to be converted into actual runtime data at runtime. HybridCLR directly calculates the corresponding runtime data during translation and saves it to the converted instructions, +which greatly improves performance. -### Object member access instructions are simple and efficient +### Implementation of object member access instructions is simple and efficient -Object member access instructions like `v.x = b;` are very common. Due to the limitations of the C# language, like ILRuntime and xlua, they have to be operated through a wrap function call. Since HybridCLR is implemented in C++, it can directly access -For the memory data of the object, by calculating the offset of the field in the object in advance, directly `*(int32_t*)(obj + offset) = b;` can complete this access operation. +Object member access instructions like `v.x = b;` are very common. ILRuntime and xlua have to operate through a wrap function call due to C# language limitations. 
Since HybridCLR is implemented in C++, it can directly access +the object's memory: by calculating the offset of the field in the object in advance, this access operation can be completed directly by `*(int32_t*)(obj + offset) = b;`. -Compared with other hot update schemes, the efficiency is improved by dozens of times. +Compared with other hot update solutions, the efficiency is improved dozens of times. -### Directly support reference and pointer operations without indirect methods +### Directly supports reference and pointer operations without indirect methods -Due to the specification restrictions of the CLI, references in C# can only be placed on the managed stack, but not on the interpreter stack (because it is heap memory). To handle something like `ref int a = ref b; a = 5;`, you have to use very complicated -The trick maintains this reference indirectly. And HybridCLR uses c++ to realize, can save and operate these data directly. +Due to CLI specification restrictions, references in C# can only be placed on the managed stack and not on the interpreter stack (because it is heap memory). In order to handle code like `ref int a = ref b; a = 5;` one has to use very complex +tricks to maintain this reference indirectly. HybridCLR is implemented in C++ and can directly store and operate on such data. Compared with other hot update solutions, the efficiency is greatly improved. -### Unified metadata, more efficient object creation, and smaller memory footprint +### Metadata is unified, object creation is more efficient, and memory usage is smaller -Due to the unified metadata, you can directly call il2cpp::vm::Object::New to create objects, the efficiency is very close to the original, and the memory is exactly the same. In contrast, other hot update schemes use fake types, -The object is bloated, and the process of creating the object is more complicated. 
+Due to the unified metadata, you can directly call il2cpp::vm::Object::New to create objects. The efficiency is very close to the native one, and the memory is exactly the same. In contrast, other hot update schemes use fake types, +so their objects are bloated and the process of creating objects is more complex. -Compared with other hot update schemes, the efficiency is greatly improved. +Compared with other hot update solutions, the efficiency is greatly improved. -### The metadata is unified, the function calling method is unified, and there is no additional overhead of PInvoke and ReservePInvoke +### Unified metadata, unified function calling methods, and no additional overhead of PInvoke and ReversePInvoke -HybridCLR can directly call the C++ function translated by the IL function without any intermediate links, while ILRuntime and xlua require various complex judgments and parameter conversions, as well as PInvoke and ReservePInvoke between C# and bring a lot of extra overhead. +HybridCLR can directly call C++ functions translated by IL functions without any intermediate links, while ILRuntime and xlua require various complex determinations and parameter conversions, as well as PInvoke and ReversePInvoke with C#, which brings a lot of additional overhead. The interaction between HybridCLR and il2cpp AOT is extremely lightweight and efficient. No more performance issues. -### additionally provide a large number of instinct functions +### Provides a large number of additional instinct functions -For common operations such as `new Vector{2,3,4}`, `new string()`, `Nullable.Value`, etc., we directly provide the corresponding instructions, and the running overhead is even lower than the implementation of AOT . +For common operations like `new Vector{2,3,4}`, `new string()`, `Nullable.Value`, etc., we directly provide corresponding instructions, and the running overhead is even lower than the AOT implementation. 
-Compared with other hot update schemes, the efficiency is improved by dozens of times. +Compared with other hot update solutions, the efficiency is improved dozens of times. -### Strictly follow the specification and do not introduce additional unnecessary costs +### Strictly follow the specifications and do not introduce additional unnecessary costs -Due to careful design and optimization, HybridCLR tries to avoid all kinds of unnecessary overhead. For example, the GC of the execution process is exactly the same as that of native il2cpp and mono. +Due to careful design and optimization, HybridCLR tries to avoid unnecessary overhead. For example, the GC of the execution process is exactly the same as native il2cpp and mono. ### Other instruction optimization techniques -Other Optimization Techniques +Other optimization techniques ## Appendix: Test case code -The following test cases are provided by a third party. The use cases are unreasonable, but we don’t want to be deliberately constructed and directly quote their use cases. - -```csharp -private static void Test0() -{ - var go = new GameObject("t"); - var transform = go. transform; - - var cnt = PerformanceSetting. Count * 1000; - for (var i = 0; i < cnt; i++) - { - transform.position = transform.position; - } - - Object. Destroy(go); -} - -private static void Test1() -{ - var go = new GameObject("t"); - var transform = go. transform; - - var cnt = PerformanceSetting. Count * 100; - for (var i = 0; i < cnt; i++) - { - transform. Rotate(Vector3. up, 1); - } - Object. Destroy(go); -} -private static void Test2() -{ - var cnt = PerformanceSetting. Count * 1000; - for (var i = 0; i < cnt; i++) - { - var v = new Vector3(i, i, i); - var x = v.x; - var y = v.y; - var z = v.z; - var r = x + y * z; - } -} - -private static void Test3() -{ - var cnt = PerformanceSetting. Count * 10; - for (var i = 0; i < cnt; i++) - { - var go = new GameObject("t"); - Object. 
Destroy(go); - } -} - -private static void Test4() -{ - var cnt = PerformanceSetting. Count * 10; - for (var i = 0; i < cnt; i++) - { - var go = new GameObject(); - go.AddComponent(); - var c = go. GetComponent(); - c. receiveShadows = false; - Object. Destroy(go); - } -} - -private static void Test5() +```csharp +public class LongArithmetics { - var cnt = PerformanceSetting. Count * 1000; - for (var i = 0; i < cnt; i++) - { - var p = Input. mousePosition; - } -} - -private static void Test6() + [Benchmark] + [Params(1000000)] + public long add_1(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + + for(long i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long add_2(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + long e = 3; + + for (long i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long mul_1(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + + for (long i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * 
b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long mul_2(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + long e = 3; + + for (long i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public long div_1(long n) + { + long a = 1; + long b = n; + long c = 2; + long d = n; + + for (long i = 0; i < n; i++) + { + b = c / a; + c = d/a; + d = b / a; + + b = c / a; + c = d/a; + d = b / a; + b = c / a; + c = d/a; + d = b / a; + b = c / a; + c = d/a; + d = b / a; + b = c / a; + c = d/a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d/a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } + + + public class IntArithmetics { - var cnt = PerformanceSetting. Count * 1000; - for (var i = 0; i < cnt; i++) - { - var v = new Vector3(i, i, i); - Vector3. 
Normalize(v); - } + [Benchmark] + [Params(1000000)] + public int add_1(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + + for(int i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int add_2(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + int e = 3; + + for (int i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int mul_1(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + + for (int i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int mul_2(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + int e = 3; + + for (int i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = 
a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public int div_1(int n) + { + int a = 1; + int b = n; + int c = 2; + int d = n; + + for (int i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } } -private static void Test7() +public class FloatArithmetics { - var cnt = PerformanceSetting. Count * 100; - for (var i = 0; i < cnt; i++) - { - var q1 = Quaternion. Euler(i, i, i); - var q2 = Quaternion. Euler(i * 2, i * 2, i * 2); - Quaternion. Slerp(Quaternion. identity, q1, 0.5f); - } + [Benchmark] + [Params(1000000)] + public float add_1(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + + for(int i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float add_2(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + float e = 3; + + for (int i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = 
d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float mul_1(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + + for (int i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float mul_2(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + float e = 3; + + for (int i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public float div_1(int n) + { + float a = 1; + float b = n; + float c = 2; + float d = n; + + for (int i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } } -private static void Test8() +public class DoubleArithmetics { - double total = 0; - var cnt = PerformanceSetting. 
Count * 10000; - for (var i = 0; i < cnt; i++) - { - total = total + i - (i / 2) * (i + 3) / (i + 5); - } + [Benchmark] + [Params(1000000)] + public double add_1(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + + for (int i = 0; i < n; i++) + { + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + a = b + c; + b = c + d; + c = d + a; + d = a + b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double add_2(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + double e = 3; + + for (int i = 0; i < n; i++) + { + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + a = b + c + d + e; + b = c + d + e + a; + c = d + e + a + b; + d = e + a + b + c; + e = a + b + c + d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double mul_1(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + + for (int i = 0; i < n; i++) + { + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + a = b * c; + b = c * d; + c = d * a; + d = a * b; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double mul_2(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + double e = 3; + + for (int i = 0; i < n; i++) + { + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * 
a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + a = b * c * d * e; + b = c * d * e * a; + c = d * e * a * b; + d = e * a * b * c; + e = a * b * c * d; + } + return a + b + c + d; + } + + + [Benchmark] + [Params(1000000)] + public double div_1(int n) + { + double a = 1; + double b = n; + double c = 2; + double d = n; + + for (int i = 0; i < n; i++) + { + b = c / a; + c = d / a; + d = b / a; + + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + b = c / a; + c = d / a; + d = b / a; + a = a / n + 1; + } + return a + b + c + d; + } } -private static void Test9() -{ - var cnt = PerformanceSetting. Count * 1000; - for (var i = 0; i < cnt; i++) - { - var a = new Vector3(1, 2, 3); - var b = new Vector3(4, 5, 6); - var c = a + b; - } -} ``` - -```lua -local function test0() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - - local go = CS.UnityEngine.GameObject("_") - local transform = go.transform - - for i = 1, cnt do - transform.position = transform.position - end - - CS.UnityEngine.GameObject.Destroy(go) -end - -local function test1() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 100 - - local go = CS.UnityEngine.GameObject("_") - local transform = go.transform - - for i = 1, cnt do - transform:Rotate(CS.UnityEngine.Vector3.up, 1) - end - - CS.UnityEngine.GameObject.Destroy(go) -end - -local function test2() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - - local go = CS.UnityEngine.GameObject("_") - local transform = go.transform - - for i = 1, cnt do - local tmp = CS.UnityEngine.Vector3(i, i, i) - local x = tmp.x - local y = tmp.y - local z 
= tmp.z - local r = x + y * z - end -end - -local function test3() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 10 - for i = 1, cnt do - local tmp = CS.UnityEngine.GameObject("___") - CS.UnityEngine.GameObject.Destroy(tmp) - end -end - -local function test4() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 10 - for i = 1, cnt do - local tmp = CS.UnityEngine.GameObject("___") - tmp:AddComponent(typeof(CS.UnityEngine.SkinnedMeshRenderer)) - local c = tmp:GetComponent(typeof(CS.UnityEngine.SkinnedMeshRenderer)) - c.receiveShadows = false - CS.UnityEngine.GameObject.Destroy(tmp) - end -end - -local function test5() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - for i = 1, cnt do - local tmp = CS.UnityEngine.Input.mousePosition; - end -end - -local function test6() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - for i = 1, cnt do - local tmp = CS.UnityEngine.Vector3(i, i, i) - CS.UnityEngine.Vector3.Normalize(tmp) - end -end - -local function test7() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 100 - for i = 1, cnt do - local t1 = CS.UnityEngine.Quaternion.Euler(i, i, i) - local t2 = CS.UnityEngine.Quaternion.Euler(i * 2, i * 2, i * 2) - CS.UnityEngine.Quaternion.Slerp(t1, t2, CS.UnityEngine.Random.Range(0.1, 0.9)) - end -end - -local function test8() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 10000 - local total = 0 - for i = 1, cnt do - total = total + i - (i / 2) * (i + 3) / (i + 5) - end -end - -local function test9() - local cnt = CS.GameMain.Scripts.Performance.PerformanceSetting.Count * 1000 - for i = 1, cnt do - local tmp0 = CS.UnityEngine.Vector3(1, 2, 3) - local tmp1 = CS.UnityEngine.Vector3(4, 5, 6) - local tmp2 = tmp0 + tmp1 - end -end - -``` \ No newline at end of file diff --git a/i18n/en/docusaurus-plugin-content-docs/current/business/advancedcodeoptimization.md 
b/i18n/en/docusaurus-plugin-content-docs/current/business/advancedoptimization.md similarity index 60% rename from i18n/en/docusaurus-plugin-content-docs/current/business/advancedcodeoptimization.md rename to i18n/en/docusaurus-plugin-content-docs/current/business/advancedoptimization.md index 2357c3551..c657d6bbb 100644 --- a/i18n/en/docusaurus-plugin-content-docs/current/business/advancedcodeoptimization.md +++ b/i18n/en/docusaurus-plugin-content-docs/current/business/advancedoptimization.md @@ -1,16 +1,21 @@ -# Advanced instruction optimization +# Offline Instruction Optimization :::warning -Advanced instruction optimizations are still under development. + +Offline instruction optimization technology is under development, and currently only standard instruction optimization technology can be used. ::: -Advanced instruction optimization technology is an implementation independent of standard instruction optimization technology. Advanced instruction optimization technology uses richer compilation optimization technology to greatly improve the performance of the interpretation module. +Offline Instruction Optimization (OIO) converts original IL instructions into customized register instructions offline. +Since there is no compilation performance limit offline, richer compilation optimization technologies can be used, which greatly improves the performance of the interpretation module. + After optimization, the overall instruction execution performance is improved by 100%-1000% (yes, more than 10 times) or even higher, especially the overall improvement of numerical instructions by nearly 300%. And because it has been converted in advance, the loading and instruction translation process is faster and the lag is smaller. -## accomplish +Offline instruction optimization technology supports virtualization technology in code reinforcement solutions, greatly improving code security. 
+ +## Technology Details -Advanced instruction optimization technology includes the following optimization technologies: +Offline instruction optimization technology includes the following optimization technologies: - Complete elimination of useless stack instructions. Eliminate all unnecessary stack operations - Peephole optimization @@ -27,4 +32,4 @@ Advanced instruction optimization technology includes the following optimization ## Performance -TODO. \ No newline at end of file +TODO. diff --git a/i18n/en/docusaurus-plugin-content-docs/current/business/basiccodeoptimization.md b/i18n/en/docusaurus-plugin-content-docs/current/business/basiccodeoptimization.md deleted file mode 100644 index 7ac3be73a..000000000 --- a/i18n/en/docusaurus-plugin-content-docs/current/business/basiccodeoptimization.md +++ /dev/null @@ -1,50 +0,0 @@ -# Offline Instruction Optimization - -:::warning - -The offline instruction optimization technology is under development, and currently only the old version of the instruction optimization technology (i.e. the original standard instruction optimization technology) can be used. -::: - -Offline Instruction Optimization (OIO) converts original IL instructions into customized register instructions offline. -Since there is no compilation performance limit offline, richer compilation optimization technologies can be used, which greatly improves the performance of the interpretation module. - -After optimization, the overall instruction execution performance is improved by 100%-1000% (yes, more than 10 times) or even higher, especially the overall improvement of numerical instructions by nearly 300%. -And because it has been converted in advance, the loading and instruction translation process is faster and the lag is smaller. - -Offline instruction optimization technology supports virtualization technology in code reinforcement solutions, greatly improving code security. 
- -## accomplish - -Advanced instruction optimization technology includes the following optimization technologies: - -- Complete elimination of useless stack instructions. Eliminate all unnecessary stack operations -- Peephole optimization -- Constant copy optimization -- Optimization of local copy propagation -- Global copy propagation optimization -- Explain function inline -- AOT function inline (patented technology) -- Provide more instinct instructions to greatly improve the performance of common instruction combinations -- Conditional check elimination technology. Eliminate unnecessary null pointer checks, type cast checks, and array out-of-bounds checks -- CheckOnce runtime checks dynamically eliminate optimizations. For example, an instruction that accesses a static member variable will no longer check whether the type has been initialized during the second execution. -- Other optimizations - - -## Performance - -TODO. - -## Standard instruction optimization technology (will be obsolete) - -Standard instruction optimization techniques are runtime optimization techniques. Careful and reliable optimization of common code paradigms has greatly improved the performance of common instructions such as variable access (50%-100%), numerical calculations (100-300%), object access (50-200%), etc., like some special The performance of codes such as typeof instructions has been improved by more than 1000%. - -Due to runtime time and memory limitations, standard instruction optimization only performs simple but reliable optimizations such as useless stack elimination and peephole optimization, and cannot perform some complex optimizations. However, since the IL instruction is a stack instruction, even if only some uncomplicated common optimizations are made, the performance is significantly improved compared to the community's unoptimized version. 
- - -The following is the performance improvement data of the standard instruction optimization compared to the community version (0 means the performance is the same, n means n times improvement). - -![interpreter_optimization](/img/hybridclr/interpreter_optimization.jpg) - -The following is a performance comparison between native and standard instruction optimization in terms of numerical calculations. The ordinate is time consumption. The standard instruction optimized addition is about 7-16 times that of the native one, the multiplication is 4 times, and the division is 2 times. - -![benchmark_numeric](/img/hybridclr/benchmark_numeric.jpg) diff --git a/i18n/en/docusaurus-plugin-content-docs/current/business/basicoptimization.md b/i18n/en/docusaurus-plugin-content-docs/current/business/basicoptimization.md new file mode 100644 index 000000000..a669dfc6f --- /dev/null +++ b/i18n/en/docusaurus-plugin-content-docs/current/business/basicoptimization.md @@ -0,0 +1,34 @@ +# Execution Optimization + +Currently, only standard interpretation execution optimization technology can be used, and the [Offline Instruction Optimization](./advancedoptimization) technology is still under development. + +## Standard Instruction Optimization + +Standard instruction optimization techniques are runtime optimization techniques. Careful and reliable optimization of common code paradigms has greatly improved the performance of common instructions such as variable access, numerical calculations, and object access. The performance of some special codes such as the typeof instruction has been improved by more than 1000%. + +The **Business Edition** greatly improves the numerical calculation performance, and its performance is **280%-735%** of the Community Edition. + +The following is the OnePlus 9R ArmV8 actual test report, with the test code appendix at the end. + +### AOT time-consuming vs. Business Edition time-consuming vs. 
Community Edition time-consuming (the smaller the better) + +![data](/img/benchmark/numeric_datas.jpg) + +### Business Edition time-consuming/AOT time-consuming vs Community Edition time-consuming/AOT time-consuming (the smaller the better) + +The performance of the AOT version is `4.1 - 90` times that of the Community Edition and `1.30 - 12.9` times that of the Business Edition. + +![data](/img/benchmark/numeric_business_vs_aot_div_aot.jpg) + + +### Business Edition performance/Community Edition performance (the bigger, the better) + +The performance of the Business Edition is `2.87-7.35` times that of the Community Edition. + +![data](/img/benchmark/numeric_dialog_business_div_community.jpg) + +### Business Edition performance/AOT version performance (the smaller, the better) + +The performance of the AOT version is `1.30 - 12.9` times that of the Business Edition. + +![data](/img/benchmark/numeric_dialog_business_div_aot.jpg) diff --git a/i18n/en/docusaurus-plugin-content-docs/current/business/intro.md b/i18n/en/docusaurus-plugin-content-docs/current/business/intro.md index 78f15549a..6f5467457 100644 --- a/i18n/en/docusaurus-plugin-content-docs/current/business/intro.md +++ b/i18n/en/docusaurus-plugin-content-docs/current/business/intro.md @@ -27,11 +27,11 @@ There are currently three commercial versions. 
The specific features are compare |[Full Generic Sharing](./fullgenericsharing)||✔|✔|✔| |[DOTS](./dots)||✔|✔|✔| |[Metadata Optimization](./metadataoptimization.md)||✔|✔|✔| -|[Offline Instruction Optimization](./basiccodeoptimization)||✔|✔|✔| +|[Execution Optimization](./basicoptimization)||✔|✔|✔| |[Code Protection](./basicencryption)||✔|✔|✔| |[Hot reload](./reload/hotreloadassembly)||||✔| |[Access Control Policy](./accesspolicy)||||✔| -|[Advanced code optimization (experimental)](./advancedcodeoptimization)|||✔|| +|[Offline Instruction Optimization (experimental)](./advancedoptimization)|||✔|| |[**DHE Technology**](./differentialhybridexecution)|||✔|| |Technical support||1 year|2 years|2 years| diff --git a/sidebars.js b/sidebars.js index b4983d862..dfc990843 100644 --- a/sidebars.js +++ b/sidebars.js @@ -86,7 +86,8 @@ const sidebars = { 'business/fullgenericsharing', 'business/dots', 'business/metadataoptimization', - 'business/basiccodeoptimization', + 'business/basicoptimization', + 'business/advancedoptimization', 'business/basicencryption', 'business/accesspolicy', 'business/businesscase', diff --git a/static/img/benchmark/numeric_business_vs_aot_div_aot.jpg b/static/img/benchmark/numeric_business_vs_aot_div_aot.jpg new file mode 100644 index 000000000..d287107c4 Binary files /dev/null and b/static/img/benchmark/numeric_business_vs_aot_div_aot.jpg differ diff --git a/static/img/benchmark/numeric_datas.jpg b/static/img/benchmark/numeric_datas.jpg new file mode 100644 index 000000000..e7964b544 Binary files /dev/null and b/static/img/benchmark/numeric_datas.jpg differ diff --git a/static/img/benchmark/numeric_dialog_business_div_aot.jpg b/static/img/benchmark/numeric_dialog_business_div_aot.jpg new file mode 100644 index 000000000..df478b69c Binary files /dev/null and b/static/img/benchmark/numeric_dialog_business_div_aot.jpg differ diff --git a/static/img/benchmark/numeric_dialog_business_div_community.jpg 
b/static/img/benchmark/numeric_dialog_business_div_community.jpg new file mode 100644 index 000000000..84e92d3b6 Binary files /dev/null and b/static/img/benchmark/numeric_dialog_business_div_community.jpg differ diff --git a/static/img/benchmark/numeric_dialog_costtime.jpg b/static/img/benchmark/numeric_dialog_costtime.jpg new file mode 100644 index 000000000..1e0b61999 Binary files /dev/null and b/static/img/benchmark/numeric_dialog_costtime.jpg differ