Reapply [InstSimplify] Remove known bits constant folding

No changes relative to last time, but this is reapplied after a mitigation for an AMDGPU regression has landed. --- If SimplifyInstruction() does not succeed in simplifying the instruction, it will compute the known bits of the instruction in the hope that all bits are known and the instruction can be folded to a constant. I have removed a similar optimization from InstCombine in D75801, and would like to drop this one as well. On average, we spend ~1% of total compile-time performing this known bits calculation. However, if we introduce some additional statistics for known bits computations and how many of them succeed in simplifying the instruction, we get (on test-suite): instsimplify.NumKnownBits: 216; instsimplify.NumKnownBitsComputed: 13828375; valuetracking.NumKnownBitsComputed: 45860806. Out of ~14M known bits calculations (accounting for approximately one third of all known bits calculations), only 0.0015% succeed in producing a constant. Those cases where we do succeed in computing all known bits will get folded by other passes like InstCombine later. On test-suite, only lencod.test and GCC-C-execute-pr44858.test show a hash difference after this change. On lencod we see an improvement (a loop phi is optimized away); on the GCC torture test we see a regression (a function return value is determined only after IPSCCP, preventing propagation from a noinline function). There are various regressions in InstSimplify tests. However, all of these cases are already handled by InstCombine, and corresponding tests have already been added there. Differential Revision: https://reviews.llvm.org/D79294
llvm · May 8, 2020 · 5a22656 · 5a22656
1 parent 989ae9e
commit 5a22656
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 123 deletions.
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5600,9 +5600,6 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
     break;
   case Instruction::Call: {
     Result = SimplifyCall(cast<CallInst>(I), Q);
-    // Don't perform known bits simplification below for musttail calls.
-    if (cast<CallInst>(I)->isMustTailCall())
-      return Result;
     break;
   }
   case Instruction::Freeze:
@@ -5620,14 +5617,6 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
     break;
   }
 
-  // In general, it is possible for computeKnownBits to determine all bits in a
-  // value even when the operands are not all constants.
-  if (!Result && I->getType()->isIntOrIntVectorTy()) {
-    KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE);
-    if (Known.isConstant())
-      Result = ConstantInt::get(I->getType(), Known.getConstant());
-  }
-
   /// If called on unreachable code, the above logic may report that the
   /// instruction simplified to itself.  Make life easier for users by
   /// detecting that case here, returning a safe value instead.

diff --git a/llvm/test/Analysis/ValueTracking/knownzero-shift.ll b/llvm/test/Analysis/ValueTracking/knownzero-shift.ll
@@ -15,9 +15,15 @@ define i1 @test(i8 %p, i8* %pq) {
 
 !0 = !{ i8 1, i8 5 }
 
+; The following cases only get folded by InstCombine,
+; see InstCombine/shift-shift.ll. If we wanted to,
+; we could explicitly handle them in InstSimplify as well.
+
 define i32 @shl_shl(i32 %A) {
 ; CHECK-LABEL: @shl_shl(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[B:%.*]] = shl i32 [[A:%.*]], 6
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[B]], 28
+; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = shl i32 %A, 6
   %C = shl i32 %B, 28
@@ -26,7 +32,9 @@ define i32 @shl_shl(i32 %A) {
 
 define <2 x i33> @shl_shl_splat_vec(<2 x i33> %A) {
 ; CHECK-LABEL: @shl_shl_splat_vec(
-; CHECK-NEXT:    ret <2 x i33> zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = shl <2 x i33> [[A:%.*]], <i33 5, i33 5>
+; CHECK-NEXT:    [[C:%.*]] = shl <2 x i33> [[B]], <i33 28, i33 28>
+; CHECK-NEXT:    ret <2 x i33> [[C]]
 ;
   %B = shl <2 x i33> %A, <i33 5, i33 5>
   %C = shl <2 x i33> %B, <i33 28, i33 28>
@@ -37,7 +45,7 @@ define <2 x i33> @shl_shl_splat_vec(<2 x i33> %A) {
 
 define <2 x i33> @shl_shl_vec(<2 x i33> %A) {
 ; CHECK-LABEL: @shl_shl_vec(
-; CHECK-NEXT:    [[B:%.*]] = shl <2 x i33> %A, <i33 6, i33 5>
+; CHECK-NEXT:    [[B:%.*]] = shl <2 x i33> [[A:%.*]], <i33 6, i33 5>
 ; CHECK-NEXT:    [[C:%.*]] = shl <2 x i33> [[B]], <i33 27, i33 28>
 ; CHECK-NEXT:    ret <2 x i33> [[C]]
 ;
@@ -48,7 +56,9 @@ define <2 x i33> @shl_shl_vec(<2 x i33> %A) {
 
 define i232 @lshr_lshr(i232 %A) {
 ; CHECK-LABEL: @lshr_lshr(
-; CHECK-NEXT:    ret i232 0
+; CHECK-NEXT:    [[B:%.*]] = lshr i232 [[A:%.*]], 231
+; CHECK-NEXT:    [[C:%.*]] = lshr i232 [[B]], 1
+; CHECK-NEXT:    ret i232 [[C]]
 ;
   %B = lshr i232 %A, 231
   %C = lshr i232 %B, 1
@@ -57,7 +67,9 @@ define i232 @lshr_lshr(i232 %A) {
 
 define <2 x i32> @lshr_lshr_splat_vec(<2 x i32> %A) {
 ; CHECK-LABEL: @lshr_lshr_splat_vec(
-; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i32> [[A:%.*]], <i32 28, i32 28>
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[B]], <i32 4, i32 4>
+; CHECK-NEXT:    ret <2 x i32> [[C]]
 ;
   %B = lshr <2 x i32> %A, <i32 28, i32 28>
   %C = lshr <2 x i32> %B, <i32 4, i32 4>
@@ -66,7 +78,9 @@ define <2 x i32> @lshr_lshr_splat_vec(<2 x i32> %A) {
 
 define <2 x i32> @lshr_lshr_vec(<2 x i32> %A) {
 ; CHECK-LABEL: @lshr_lshr_vec(
-; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i32> [[A:%.*]], <i32 29, i32 28>
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[B]], <i32 4, i32 5>
+; CHECK-NEXT:    ret <2 x i32> [[C]]
 ;
   %B = lshr <2 x i32> %A, <i32 29, i32 28>
   %C = lshr <2 x i32> %B, <i32 4, i32 5>

diff --git a/llvm/test/Transforms/GVN/PRE/volatile.ll b/llvm/test/Transforms/GVN/PRE/volatile.ll
@@ -197,14 +197,17 @@ exit:
   ret i32 %add
 }
 
+; This test checks that we don't optimize away instructions that are
+; simplified by SimplifyInstruction(), but are not trivially dead.
+
 define i32 @test9(i32* %V) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32* [[V:%.*]], !range !0
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[LOAD:%.*]] = call i32 undef()
+; CHECK-NEXT:    ret i32 undef
 ;
 entry:
-  %load = load volatile i32, i32* %V, !range !0
+  %load = call i32 undef()
   ret i32 %load
 }
 

diff --git a/llvm/test/Transforms/InstSimplify/assume.ll b/llvm/test/Transforms/InstSimplify/assume.ll
diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll
@@ -988,7 +988,7 @@ declare i8* @passthru_p8(i8* returned)
 define i32 @returned_const_int_arg() {
 ; CHECK-LABEL: @returned_const_int_arg(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @passthru_i32(i32 42)
-; CHECK-NEXT:    ret i32 42
+; CHECK-NEXT:    ret i32 [[X]]
 ;
   %x = call i32 @passthru_i32(i32 42)
   ret i32 %x

diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll
@@ -98,10 +98,17 @@ define i8 @test10(i8 %A) {
   ret i8 %D
 }
 
+; The following two cases only get folded by InstCombine,
+; see InstCombine/or-xor.ll.
+
 ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
 define i8 @test11(i8 %A) {
 ; CHECK-LABEL: @test11(
-; CHECK-NEXT:    ret i8 -1
+; CHECK-NEXT:    [[B:%.*]] = or i8 [[A:%.*]], -2
+; CHECK-NEXT:    [[C:%.*]] = xor i8 [[B]], 13
+; CHECK-NEXT:    [[D:%.*]] = or i8 [[C]], 1
+; CHECK-NEXT:    [[E:%.*]] = xor i8 [[D]], 12
+; CHECK-NEXT:    ret i8 [[E]]
 ;
   %B = or i8 %A, -2
   %C = xor i8 %B, 13
@@ -112,7 +119,12 @@ define i8 @test11(i8 %A) {
 
 define i8 @test11v(<2 x i8> %A) {
 ; CHECK-LABEL: @test11v(
-; CHECK-NEXT:    ret i8 -1
+; CHECK-NEXT:    [[B:%.*]] = or <2 x i8> [[A:%.*]], <i8 -2, i8 0>
+; CHECK-NEXT:    [[CV:%.*]] = xor <2 x i8> [[B]], <i8 13, i8 13>
+; CHECK-NEXT:    [[C:%.*]] = extractelement <2 x i8> [[CV]], i32 0
+; CHECK-NEXT:    [[D:%.*]] = or i8 [[C]], 1
+; CHECK-NEXT:    [[E:%.*]] = xor i8 [[D]], 12
+; CHECK-NEXT:    ret i8 [[E]]
 ;
   %B = or <2 x i8> %A, <i8 -2, i8 0>
   %CV = xor <2 x i8> %B, <i8 13, i8 13>

diff --git a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
@@ -145,7 +145,8 @@ define i1 @shl_i1(i1 %a, i1 %b) {
   ret i1 %shl
 }
 
-; Simplify count leading/trailing zeros to zero if all valid bits are shifted out.
+; The following cases only get folded by InstCombine,
+; see InstCombine/lshr.ll.
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
@@ -154,7 +155,9 @@ declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
 
 define i32 @lshr_ctlz_zero_is_undef(i32 %x) {
 ; CHECK-LABEL: @lshr_ctlz_zero_is_undef(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[CT:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[CT]], 5
+; CHECK-NEXT:    ret i32 [[SH]]
 ;
   %ct = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
   %sh = lshr i32 %ct, 5
@@ -163,7 +166,9 @@ define i32 @lshr_ctlz_zero_is_undef(i32 %x) {
 
 define i32 @lshr_cttz_zero_is_undef(i32 %x) {
 ; CHECK-LABEL: @lshr_cttz_zero_is_undef(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[CT:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[CT]], 5
+; CHECK-NEXT:    ret i32 [[SH]]
 ;
   %ct = call i32 @llvm.cttz.i32(i32 %x, i1 true)
   %sh = lshr i32 %ct, 5
@@ -172,7 +177,9 @@ define i32 @lshr_cttz_zero_is_undef(i32 %x) {
 
 define <2 x i8> @lshr_ctlz_zero_is_undef_splat_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_ctlz_zero_is_undef_splat_vec(
-; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 3>
+; CHECK-NEXT:    ret <2 x i8> [[SH]]
 ;
   %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 3>
@@ -181,7 +188,10 @@ define <2 x i8> @lshr_ctlz_zero_is_undef_splat_vec(<2 x i8> %x) {
 
 define i8 @lshr_ctlz_zero_is_undef_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_ctlz_zero_is_undef_vec(
-; CHECK-NEXT:    ret i8 0
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 0>
+; CHECK-NEXT:    [[EX:%.*]] = extractelement <2 x i8> [[SH]], i32 0
+; CHECK-NEXT:    ret i8 [[EX]]
 ;
   %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 0>
@@ -191,7 +201,9 @@ define i8 @lshr_ctlz_zero_is_undef_vec(<2 x i8> %x) {
 
 define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_cttz_zero_is_undef_splat_vec(
-; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 3>
+; CHECK-NEXT:    ret <2 x i8> [[SH]]
 ;
   %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 3>
@@ -200,7 +212,10 @@ define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) {
 
 define i8 @lshr_cttz_zero_is_undef_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_cttz_zero_is_undef_vec(
-; CHECK-NEXT:    ret i8 0
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 0>
+; CHECK-NEXT:    [[EX:%.*]] = extractelement <2 x i8> [[SH]], i32 0
+; CHECK-NEXT:    ret i8 [[EX]]
 ;
   %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 0>