-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Perf -1,517%] System.Numerics.Tests.Perf_Vector2.DistanceBenchmark #50939
Comments
Tagging subscribers to this area: @tannergooding, @pgovind
Issue Details
Run Information
Regressions in System.Numerics.Tests.Perf_Vector2
Repro: git clone https://github.com/dotnet/performance.git
python3 .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Numerics.Tests.Perf_Vector2*'
Histogram: System.Numerics.Tests.Perf_Vector2.DistanceBenchmark
Docs: Profiling workflow for dotnet/runtime repository
|
@DrewScoggins, do we have disassembly easily accessible? |
Thanks to @adamsitnik for noticing this. Looks related to PR #41898. |
No, because this is so old all the artifacts, if we had them, would have been purged. And this was before we added disassembly to the report. |
In this case we are lucky and the regression is reproducible on Windows. The disassembly: git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f net5.0 net6.0 --filter System.Numerics.Tests.Perf_Vector2.DistanceBenchmark --bdn-arguments "--disasm true" .NET 5.0.5 (5.0.521.16609), X64 RyuJIT; System.Numerics.Tests.Perf_Vector2.DistanceBenchmark()
vzeroupper
mov rax,16AD17D7C78
mov rax,[rax]
vmovsd xmm0,qword ptr [rax+8]
mov rax,16AD17D7C80
mov rax,[rax]
vmovsd xmm1,qword ptr [rax+8]
vsubps xmm0,xmm0,xmm1
vdpps xmm0,xmm0,xmm0,31
vsqrtss xmm0,xmm0,xmm0
ret
; Total bytes of code 54 .NET 6.0.0 (6.0.21.20503), X64 RyuJIT; System.Numerics.Tests.Perf_Vector2.DistanceBenchmark()
sub rsp,18
vzeroupper
mov rax,13834F694A8
mov rax,[rax]
add rax,8
vmovss xmm0,dword ptr [rax]
vmovss dword ptr [rsp+10],xmm0
vmovss xmm0,dword ptr [rax+4]
vmovss dword ptr [rsp+14],xmm0
mov rax,13834F694B0
mov rax,[rax]
add rax,8
vmovss xmm0,dword ptr [rax]
vmovss dword ptr [rsp+8],xmm0
vmovss xmm0,dword ptr [rax+4]
vmovss dword ptr [rsp+0C],xmm0
vmovsd xmm0,qword ptr [rsp+10]
vmovsd xmm1,qword ptr [rsp+8]
vsubps xmm0,xmm0,xmm1
vdpps xmm0,xmm0,xmm0,31
vsqrtss xmm0,xmm0,xmm0
add rsp,18
ret
; Total bytes of code 114 |
Run Information
Regressions in System.Numerics.Tests.Perf_Matrix3x2
Related Issue on x64 Windows: [Perf 16%] System.Numerics.Tests.Perf_Matrix3x2.EqualsBenchmark. Related Issue on x86 Windows: [Perf 37%] System.Numerics.Tests.Perf_Matrix3x2 (2)
Repro: git clone https://github.com/dotnet/performance.git
python3 .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Numerics.Tests.Perf_Matrix3x2*'
Histogram: System.Numerics.Tests.Perf_Matrix3x2.IsIdentityBenchmark
System.Numerics.Tests.Perf_Matrix3x2.CreateScaleFromScalarXYWithCenterBenchmark
Docs: Profiling workflow for dotnet/runtime repository |
These tests also regressed over the same commit range. |
.NET 5.0.5 (5.0.521.16609), X64 RyuJIT; System.Numerics.Tests.Perf_Matrix3x2.IsIdentityBenchmark()
sub rsp,38
vxorps xmm4,xmm4,xmm4
vmovdqa xmmword ptr [rsp+20],xmm4
xor eax,eax
mov [rsp+30],rax
lea rcx,[rsp+20]
call System.Numerics.Matrix3x2.get_Identity()
lea rcx,[rsp+20]
call System.Numerics.Matrix3x2.get_IsIdentity()
nop
add rsp,38
ret
; Total bytes of code 47 ; System.Numerics.Matrix3x2.get_Identity()
vzeroupper
mov rax,1ECEDC01420
mov rax,[rax]
vmovdqu xmm0,xmmword ptr [rax+8]
vmovdqu xmmword ptr [rcx],xmm0
mov rdx,[rax+18]
mov [rcx+10],rdx
mov rax,rcx
ret
; Total bytes of code 37 ; System.Numerics.Matrix3x2.get_IsIdentity()
vzeroupper
vmovss xmm0,dword ptr [rcx]
vucomiss xmm0,dword ptr [7FFEC58B2528]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+0C]
vucomiss xmm0,dword ptr [7FFEC58B252C]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+4]
vxorps xmm1,xmm1,xmm1
vucomiss xmm0,xmm1
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+8]
vxorps xmm1,xmm1,xmm1
vucomiss xmm0,xmm1
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+10]
vxorps xmm1,xmm1,xmm1
vucomiss xmm0,xmm1
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+14]
vxorps xmm1,xmm1,xmm1
vucomiss xmm0,xmm1
setnp al
jp short M02_L00
sete al
M02_L00:
movzx eax,al
ret
M02_L01:
xor eax,eax
ret
; Total bytes of code 115 .NET 6.0.0 (6.0.21.20503), X64 RyuJIT; System.Numerics.Tests.Perf_Matrix3x2.IsIdentityBenchmark()
sub rsp,98
vzeroupper
vxorps xmm4,xmm4,xmm4
vmovdqa xmmword ptr [rsp+80],xmm4
xor eax,eax
mov [rsp+90],rax
lea rcx,[rsp+80]
call System.Numerics.Matrix3x2.get_Identity()
vmovdqu xmm0,xmmword ptr [rsp+80]
vmovdqu xmmword ptr [rsp+68],xmm0
mov rcx,[rsp+90]
mov [rsp+78],rcx
lea rcx,[rsp+50]
call System.Numerics.Matrix3x2.get_Identity()
vmovdqu xmm0,xmmword ptr [rsp+68]
vmovdqu xmmword ptr [rsp+38],xmm0
mov rcx,[rsp+78]
mov [rsp+48],rcx
vmovdqu xmm0,xmmword ptr [rsp+50]
vmovdqu xmmword ptr [rsp+20],xmm0
mov rcx,[rsp+60]
mov [rsp+30],rcx
lea rcx,[rsp+38]
lea rdx,[rsp+20]
call System.Numerics.Matrix3x2.op_Equality(System.Numerics.Matrix3x2, System.Numerics.Matrix3x2)
movzx eax,al
add rsp,98
ret
; Total bytes of code 154 ; System.Numerics.Matrix3x2.get_Identity()
vzeroupper
mov rax,1DE240E1428
mov rax,[rax]
vmovdqu xmm0,xmmword ptr [rax+8]
vmovdqu xmmword ptr [rcx],xmm0
mov rdx,[rax+18]
mov [rcx+10],rdx
mov rax,rcx
ret
; Total bytes of code 37 ; System.Numerics.Matrix3x2.op_Equality(System.Numerics.Matrix3x2, System.Numerics.Matrix3x2)
vzeroupper
vmovss xmm0,dword ptr [rcx]
vucomiss xmm0,dword ptr [rdx]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+0C]
vucomiss xmm0,dword ptr [rdx+0C]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+4]
vucomiss xmm0,dword ptr [rdx+4]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+8]
vucomiss xmm0,dword ptr [rdx+8]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+10]
vucomiss xmm0,dword ptr [rdx+10]
jp short M02_L01
jne short M02_L01
vmovss xmm0,dword ptr [rcx+14]
vucomiss xmm0,dword ptr [rdx+14]
setnp al
jp short M02_L00
sete al
M02_L00:
movzx eax,al
ret
M02_L01:
xor eax,eax
ret
; Total bytes of code 96 |
.NET 5.0.5 (5.0.521.16609), X64 RyuJIT; System.Numerics.Tests.Perf_Matrix3x2.CreateScaleFromScalarXYWithCenterBenchmark()
vzeroupper
vmovss xmm2,dword ptr [7FFDC56928A8]
vmovss xmm1,dword ptr [7FFDC56928AC]
vxorps xmm0,xmm0,xmm0
vmovq r9,xmm0
mov rcx,rdx
jmp near ptr 00007FFDC5692750
; Total bytes of code 36 .NET 6.0.0 (6.0.21.20503), X64 RyuJIT; System.Numerics.Tests.Perf_Matrix3x2.CreateScaleFromScalarXYWithCenterBenchmark()
vzeroupper
vmovss xmm2,dword ptr [7FFF0816AEB8]
vmovss xmm1,dword ptr [7FFF0816AEBC]
vxorps xmm0,xmm0,xmm0
vmovq r9,xmm0
mov rcx,rdx
jmp near ptr System.Numerics.Matrix3x2.CreateScale(Single, Single, System.Numerics.Vector2)
; Total bytes of code 36 ; System.Numerics.Matrix3x2.CreateScale(Single, Single, System.Numerics.Vector2)
push rsi
sub rsp,40
vzeroupper
mov [rsp+68],r9
mov rsi,rcx
vmovss dword ptr [rsp+58],xmm1
vmovss dword ptr [rsp+60],xmm2
lea rcx,[rsp+28]
call System.Numerics.Matrix3x2.get_Identity()
vmovss xmm0,dword ptr [7FFF0816AF70]
vmovss xmm1,dword ptr [rsp+58]
vsubss xmm0,xmm0,xmm1
vmulss xmm0,xmm0,dword ptr [rsp+68]
vmovss xmm2,dword ptr [7FFF0816AF70]
vmovss xmm3,dword ptr [rsp+60]
vsubss xmm2,xmm2,xmm3
vmulss xmm2,xmm2,dword ptr [rsp+6C]
vmovss dword ptr [rsp+28],xmm1
vmovss dword ptr [rsp+34],xmm3
vmovss dword ptr [rsp+38],xmm0
vmovss dword ptr [rsp+3C],xmm2
vmovdqu xmm0,xmmword ptr [rsp+28]
vmovdqu xmmword ptr [rsi],xmm0
mov rax,[rsp+38]
mov [rsi+10],rax
mov rax,rsi
add rsp,40
pop rsi
ret
; Total bytes of code 138 |
For
to
So with inlining you would expect zero differences, but they obviously exist 😄 |
@tannergooding could you please take a look? |
Compared to .NET 5, we still have this regression: System.Numerics.Tests.Perf_Matrix3x2.IsIdentityBenchmark
@tannergooding could you PTAL? |
It's worse for x86: System.Numerics.Tests.Perf_Vector3.DistanceBenchmark
System.Numerics.Tests.Perf_Vector2.DistanceBenchmark
Repro: git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f net5.0 net6.0 --architecture x86 --filter System.Numerics.Tests.Perf_Vector3.DistanceBenchmark --bdn-arguments "--disasm true" .NET 5.0.9 (5.0.921.35908), X86 RyuJIT; System.Numerics.Tests.Perf_Vector3.DistanceBenchmark()
push eax
vzeroupper
mov eax,ds:[4668]
lea eax,[eax+4]
vmovss xmm0,dword ptr [eax+8]
vmovsd xmm1,qword ptr [eax]
vshufps xmm1,xmm1,xmm0,44
mov eax,ds:[466C]
lea eax,[eax+4]
vmovss xmm0,dword ptr [eax+8]
vmovsd xmm2,qword ptr [eax]
vshufps xmm2,xmm2,xmm0,44
vsubps xmm0,xmm1,xmm2
vdpps xmm0,xmm0,xmm0,71
vsqrtss xmm0,xmm0,xmm0
vmovss dword ptr [esp],xmm0
fld dword ptr [esp]
pop ecx
ret
; Total bytes of code 72 .NET 6.0.0 (6.0.21.41701), X86 RyuJIT; System.Numerics.Tests.Perf_Vector3.DistanceBenchmark()
sub esp,24
vzeroupper
mov eax,ds:[4668]
add eax,4
vmovss xmm0,dword ptr [eax]
vmovss dword ptr [esp+10],xmm0
vmovss xmm0,dword ptr [eax+4]
vmovss dword ptr [esp+14],xmm0
vmovss xmm0,dword ptr [eax+8]
vmovss dword ptr [esp+18],xmm0
mov eax,ds:[466C]
add eax,4
vmovss xmm0,dword ptr [eax]
vmovss dword ptr [esp],xmm0
vmovss xmm0,dword ptr [eax+4]
vmovss dword ptr [esp+4],xmm0
vmovss xmm0,dword ptr [eax+8]
vmovss dword ptr [esp+8],xmm0
vmovupd xmm0,[esp+10]
vmovupd xmm1,[esp]
vsubps xmm0,xmm0,xmm1
vdpps xmm0,xmm0,xmm0,71
vsqrtss xmm0,xmm0,xmm0
vmovss dword ptr [esp+20],xmm0
fld dword ptr [esp+20]
add esp,24
ret
; Total bytes of code 124 |
This is due to the same original change, which cleaned up a bunch of the logic in the [reference lost in extraction]. The JIT is preserving two struct copies that occur from the inlining, which is causing the perf regression:
and
It would be simple enough to revert this particular method, but it would also be ideal if this could just be correctly handled and optimized. CC. @dotnet/jit-contrib |
Run Information
Regressions in System.Numerics.Tests.Perf_Vector2
Historical Data in Reporting System
Repro
Histogram
System.Numerics.Tests.Perf_Vector2.DistanceBenchmark
Docs
Profiling workflow for dotnet/runtime repository
Benchmarking workflow for dotnet/runtime repository
The text was updated successfully, but these errors were encountered: