From 7356708970d22dec7e947cda050f0b3216bd55b0 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Wed, 22 Mar 2017 10:22:44 +0000 Subject: [PATCH] md5 benchmark 15% speedup by removing "slow LEA" Remove a special case from the assembler that generates "slow LEA" instructions that can execute poorly on Skylake and Haswell CPUs. A "slow LEA" is one that uses base+index+offset operands. These instructions "have increased latency and reduced dispatch port choices compared to other LEAs." Links: - https://software.intel.com/en-us/node/544484 - http://stackoverflow.com/questions/21288214/what-are-fast-lea-and-slow-lea-unit-in-the-microarchitecture-of-intes-cpu Resolves RaptorJIT/RaptorJIT#54. Here is an example of a "slow LEA" instruction that was emitted before: lea eax, [rbx+rdx+1234] The new replacement avoids the bad case: lea eax, [rbx+1234] add rax, rdx On Haswell and Skylake CPUs this improves the md5 benchmark performance by ~15%. The difference in cycles (time) correlates closely with the difference in slow LEA instructions executed (as reported by the CPU performance monitoring unit). 
Before: Performance counter stats for './luajit ../../luajit-test-cleanup/bench/md5.lua 20000': 8,166,721,155 instructions # 2.02 insn per cycle 4,039,743,481 cycles 633,604,974 uops_issued_slow_lea 1.683641631 seconds time elapsed After: 8,463,581,471 instructions # 2.45 insn per cycle 3,454,061,396 cycles 340,049,934 uops_issued_slow_lea --- src/lj_asm_x86.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 5cf08a000c..8abf122a9a 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1710,20 +1710,6 @@ static int asm_lea(ASMState *as, IRIns *ir) } else { return 0; } - } else if (ir->op1 != ir->op2 && irl->o == IR_ADD && mayfuse(as, ir->op1) && - (irref_isk(ir->op2) || irref_isk(irl->op2))) { - Reg idx, base = ra_alloc1(as, irl->op1, allow); - rset_clear(allow, base); - as->mrm.base = (uint8_t)base; - if (irref_isk(ir->op2)) { - as->mrm.ofs = irr->i; - idx = ra_alloc1(as, irl->op2, allow); - } else { - as->mrm.ofs = IR(irl->op2)->i; - idx = ra_alloc1(as, ir->op2, allow); - } - rset_clear(allow, idx); - as->mrm.idx = (uint8_t)idx; } else { return 0; }