Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: improve NULL count estimation for single column index (#9474) #9979

Merged
merged 6 commits into from
Apr 2, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix ut failure
  • Loading branch information
eurekaka committed Apr 2, 2019
commit 2a7ffdedec41903f36f0add1a6bbb145eeaa48bb
4 changes: 2 additions & 2 deletions cmd/explaintest/r/index_join.result
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ insert into t2 values(1, 1);
analyze table t1, t2;
explain select /*+ TIDB_INLJ(t1, t2) */ * from t1 join t2 on t1.a=t2.a;
id count task operator info
IndexJoin_14 1.25 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
IndexJoin_14 5.00 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
├─IndexLookUp_13 5.00 root
│ ├─IndexScan_11 5.00 cop table:t1, index:a, range: decided by [test.t2.a], keep order:false
│ └─TableScan_12 5.00 cop table:t1, keep order:false
└─TableReader_16 1.00 root data:TableScan_15
└─TableScan_15 1.00 cop table:t2, range:[-inf,+inf], keep order:false
explain select * from t1 join t2 on t1.a=t2.a;
id count task operator info
IndexJoin_14 1.25 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
IndexJoin_14 5.00 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
├─IndexLookUp_13 5.00 root
│ ├─IndexScan_11 5.00 cop table:t1, index:a, range: decided by [test.t2.a], keep order:false
│ └─TableScan_12 5.00 cop table:t1, keep order:false
Expand Down
4 changes: 2 additions & 2 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ PARTITION BY RANGE ( a ) (
for _, def := range pi.Definitions {
statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
c.Assert(statsTbl.Pseudo, IsFalse)
c.Assert(len(statsTbl.Columns), Equals, 2)
c.Assert(len(statsTbl.Columns), Equals, 3)
c.Assert(len(statsTbl.Indices), Equals, 1)
for _, col := range statsTbl.Columns {
c.Assert(col.Len(), Greater, 0)
Expand All @@ -81,7 +81,7 @@ PARTITION BY RANGE ( a ) (
statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
if i == 0 {
c.Assert(statsTbl.Pseudo, IsFalse)
c.Assert(len(statsTbl.Columns), Equals, 2)
c.Assert(len(statsTbl.Columns), Equals, 3)
c.Assert(len(statsTbl.Indices), Equals, 1)
} else {
c.Assert(statsTbl.Pseudo, IsTrue)
Expand Down
24 changes: 12 additions & 12 deletions executor/show_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ func (s *testSuite) TestShowStatsHasNullValue(c *C) {
tk.MustExec("insert into t values(1)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
"test t a 0 0 1 1 1 1",
"test t idx 1 0 1 1 1 1",
"test t a 0 0 1 1 1 1",
"test t idx 1 0 1 1 1 1",
))
tk.MustExec("drop table t")
tk.MustExec("create table t (a int, b int, index idx(a, b))")
tk.MustExec("insert into t values(NULL, NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))

tk.MustExec("drop table t")
tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
Expand All @@ -131,13 +131,13 @@ func (s *testSuite) TestShowStatsHasNullValue(c *C) {
tk.MustExec("analyze table t index idx_b")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][7], Equals, "4")
c.Assert(res.Rows()[0][6], Equals, "4")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_c_a")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][7], Equals, "0")
c.Assert(res.Rows()[0][6], Equals, "0")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
c.Assert(len(res.Rows()), Equals, 0)
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
Expand All @@ -149,16 +149,16 @@ func (s *testSuite) TestShowStatsHasNullValue(c *C) {
tk.MustExec("analyze table t index")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 2)
c.Assert(res.Rows()[0][7], Equals, "4")
c.Assert(res.Rows()[1][7], Equals, "0")
c.Assert(res.Rows()[0][6], Equals, "4")
c.Assert(res.Rows()[1][6], Equals, "0")
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
tk.MustExec("analyze table t")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 5)
c.Assert(res.Rows()[0][7], Equals, "1")
c.Assert(res.Rows()[1][7], Equals, "4")
c.Assert(res.Rows()[2][7], Equals, "1")
c.Assert(res.Rows()[3][7], Equals, "4")
c.Assert(res.Rows()[4][7], Equals, "0")
c.Assert(res.Rows()[0][6], Equals, "1")
c.Assert(res.Rows()[1][6], Equals, "4")
c.Assert(res.Rows()[2][6], Equals, "1")
c.Assert(res.Rows()[3][6], Equals, "4")
c.Assert(res.Rows()[4][6], Equals, "0")
}
14 changes: 7 additions & 7 deletions planner/core/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ func (s *testAnalyzeSuite) TestIndexRead(c *C) {
},
{
sql: "select count(*) from t where e > 1 group by b",
best: "IndexLookUp(Index(t.b)[[NULL,+inf]], Table(t)->Sel([gt(test.t.e, 1)]))->StreamAgg",
best: "TableReader(Table(t)->Sel([gt(test.t.e, 1)])->HashAgg)->HashAgg",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reviewers: before this PR, column b was not analyzed because an index exists on b, so the estimated NDV of b was about 8000; after this PR, the estimated NDV of b is 100, which is consistent with the data, and the plan changes because of this NDV change.

},
{
sql: "select count(e) from t where t.b <= 20",
Expand Down Expand Up @@ -453,7 +453,7 @@ func (s *testAnalyzeSuite) TestAnalyze(c *C) {
}{
{
sql: "analyze table t3",
best: "Analyze{Index(a),Table(b)}",
best: "Analyze{Index(a),Table(a, b)}",
},
// Test analyze full table.
{
Expand Down Expand Up @@ -676,11 +676,11 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
" ├─TableReader_12 10.00 root data:TableScan_11",
" │ └─TableScan_11 10.00 cop table:t, range:[-inf,+inf], keep order:false",
" └─MaxOneRow_13 1.00 root ",
" └─Projection_14 0.80 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.80 root ",
" ├─IndexScan_18 1.25 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.80 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.25 cop table:t, keep order:false",
" └─Projection_14 0.10 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.10 root ",
" ├─IndexScan_18 1.00 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.10 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.00 cop table:t, keep order:false",
))
}

Expand Down
4 changes: 2 additions & 2 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,9 +465,9 @@ func (hg *Histogram) totalRowCount() float64 {
// notNullCount would return same value as totalRowCount for multi-column index histograms.
func (hg *Histogram) notNullCount() float64 {
if hg.Len() == 0 {
return float64(hg.NullCount)
return 0
}
return float64(hg.Buckets[hg.Len()-1].Count + hg.NullCount)
return float64(hg.Buckets[hg.Len()-1].Count)
}

// mergeBuckets is used to merge every two neighbor buckets.
Expand Down
6 changes: 3 additions & 3 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,14 +377,14 @@ func BenchmarkSelectivity(b *testing.B) {
pprof.StopCPUProfile()
}

func (s *testStatsSuite) TestColumnIndexNullEstimation(c *C) {
defer cleanEnv(c, s.store, s.do)
func (s *testSelectivitySuite) TestColumnIndexNullEstimation(c *C) {
defer cleanEnv(c, s.store, s.dom)
testKit := testkit.NewTestKit(c, s.store)
testKit.MustExec("use test")
testKit.MustExec("drop table if exists t")
testKit.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
testKit.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null);")
h := s.do.StatsHandle()
h := s.dom.StatsHandle()
c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil)
testKit.MustExec("analyze table t")
testKit.MustQuery(`explain select b from t where b is null`).Check(testkit.Rows(
Expand Down
2 changes: 1 addition & 1 deletion statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, co
// GetRowCountByIndexRanges estimates the row count by a slice of Range.
func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
idx := coll.Indices[idxID]
if idx == nil || coll.Pseudo && idx.NotAccurate() || idx.Len() == 0 {
if idx == nil || coll.Pseudo && idx.NotAccurate() || idx.totalRowCount() == 0 {
colsLen := -1
if idx != nil && idx.Info.Unique {
colsLen = len(idx.Info.Columns)
Expand Down