Skip to content

Commit

Permalink
MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB hist…
Browse files Browse the repository at this point in the history
…ogram

Variant #2.

When Histogram::point_selectivity() sees that the point value of interest
falls into one bucket, it tries to guess whether the bucket has many
different (unpopular) values or a few popular values. (The number of
rows is fixed, as it's a Height-balanced histogram).
The basis for this guess is the "width" of the value range the bucket
covers. Buckets covering wider value ranges are assumed to contain
values with proportionally lower frequencies.

This is just a [brave] guesswork. For a very narrow bucket, it may
produce an estimate that's larger than total #rows in the bucket
or even in the whole table.

Remove the guesswork and replace it with basic logic: return
either the per-table average selectivity of col=const, or selectivity
of one bucket, whichever is lower.
  • Loading branch information
spetrunia committed Apr 28, 2023
1 parent bc97057 commit 85cc831
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 53 deletions.
77 changes: 73 additions & 4 deletions mysql-test/main/selectivity.result
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.78 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
drop table t1;
Expand Down Expand Up @@ -1649,7 +1649,7 @@ test.t1 analyze status Table is already up to date
# Check what info the optimizer has about selectivities
explain extended select * from t1 use index () where a in (17,51,5);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
explain extended select * from t1 use index () where b=2;
Expand Down Expand Up @@ -1935,9 +1935,78 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
DROP TABLE t1;
# End of 10.2 tests
#
# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
#
create table t0(a int);
insert into t0 select 1 from seq_1_to_78;
create table t1(a int);
insert into t1 select 1 from seq_1_to_26;
create table t10 (a int);
insert into t10 select 0 from t0, seq_1_to_4;
insert into t10 select 8693 from t1;
insert into t10 select 8694 from t1;
insert into t10 select 8695 from t1;
insert into t10 select 34783 from t1;
insert into t10 select 34784 from t1;
insert into t10 select 34785 from t1;
insert into t10 select 34785 from t0, seq_1_to_8;
insert into t10 select 65214 from t1;
insert into t10 select 65215 from t1;
insert into t10 select 65216 from t1;
insert into t10 select 65216 from t0, seq_1_to_52;
insert into t10 select 65217 from t1;
insert into t10 select 65218 from t1;
insert into t10 select 65219 from t1;
insert into t10 select 65219 from t0;
insert into t10 select 73913 from t1;
insert into t10 select 73914 from t1;
insert into t10 select 73915 from t1;
insert into t10 select 73915 from t0, seq_1_to_40;
insert into t10 select 78257 from t1;
insert into t10 select 78258 from t1;
insert into t10 select 78259 from t1;
insert into t10 select 91300 from t1;
insert into t10 select 91301 from t1;
insert into t10 select 91302 from t1;
insert into t10 select 91302 from t0, seq_1_to_6;
insert into t10 select 91303 from t1;
insert into t10 select 91304 from t1;
insert into t10 select 91305 from t1;
insert into t10 select 91305 from t0, seq_1_to_8;
insert into t10 select 99998 from t1;
insert into t10 select 99999 from t1;
insert into t10 select 100000 from t1;
set use_stat_tables=preferably;
analyze table t10 persistent for all;
Table Op Msg_type Msg_text
test.t10 analyze status Engine-independent statistics collected
test.t10 analyze status OK
flush tables;
set @tmp=@@optimizer_trace;
set optimizer_trace=1;
explain select * from t10 where a in (91303);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
# Must have selectivity_from_histogram <= 1.0:
select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
from information_schema.optimizer_trace;
json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
[
[
{
"column_name": "a",
"ranges":
["91303 <= a <= 91303"],
"selectivity_from_histogram": 0.0357
}
]
]
set optimizer_trace=@tmp;
drop table t0,t1,t10;
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;
DROP TABLE t1;
# End of 10.2 tests
set @@global.histogram_size=@save_histogram_size;
85 changes: 82 additions & 3 deletions mysql-test/main/selectivity.test
Original file line number Diff line number Diff line change
Expand Up @@ -1319,14 +1319,93 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
FLUSH TABLES;

EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;

DROP TABLE t1;

--echo # End of 10.2 tests

--echo #
--echo # MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
--echo #
create table t0(a int); # This holds how many rows we hold in a bucket.
insert into t0 select 1 from seq_1_to_78;

create table t1(a int); # one-third of a bucket
insert into t1 select 1 from seq_1_to_26;

create table t10 (a int);
insert into t10 select 0 from t0, seq_1_to_4;

insert into t10 select 8693 from t1;
insert into t10 select 8694 from t1;
insert into t10 select 8695 from t1;


insert into t10 select 34783 from t1;
insert into t10 select 34784 from t1;
insert into t10 select 34785 from t1;


insert into t10 select 34785 from t0, seq_1_to_8;

insert into t10 select 65214 from t1;
insert into t10 select 65215 from t1;
insert into t10 select 65216 from t1;

insert into t10 select 65216 from t0, seq_1_to_52;

insert into t10 select 65217 from t1;
insert into t10 select 65218 from t1;
insert into t10 select 65219 from t1;

insert into t10 select 65219 from t0;


insert into t10 select 73913 from t1;
insert into t10 select 73914 from t1;
insert into t10 select 73915 from t1;

insert into t10 select 73915 from t0, seq_1_to_40;


insert into t10 select 78257 from t1;
insert into t10 select 78258 from t1;
insert into t10 select 78259 from t1;

insert into t10 select 91300 from t1;
insert into t10 select 91301 from t1;
insert into t10 select 91302 from t1;

insert into t10 select 91302 from t0, seq_1_to_6;

insert into t10 select 91303 from t1; # Only 1/3rd of bucket matches the search tuple
insert into t10 select 91304 from t1;
insert into t10 select 91305 from t1;

insert into t10 select 91305 from t0, seq_1_to_8;

insert into t10 select 99998 from t1;
insert into t10 select 99999 from t1;
insert into t10 select 100000 from t1;

set use_stat_tables=preferably;
analyze table t10 persistent for all;
flush tables;

set @tmp=@@optimizer_trace;
set optimizer_trace=1;
explain select * from t10 where a in (91303);

--echo # Must have selectivity_from_histogram <= 1.0:
select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
from information_schema.optimizer_trace;

set optimizer_trace=@tmp;
drop table t0,t1,t10;

set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;
#
# Clean up
#
Expand Down
75 changes: 71 additions & 4 deletions mysql-test/main/selectivity_innodb.result
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.78 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
drop table t1;
Expand Down Expand Up @@ -1659,7 +1659,7 @@ test.t1 analyze status OK
# Check what info the optimizer has about selectivities
explain extended select * from t1 use index () where a in (17,51,5);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
explain extended select * from t1 use index () where b=2;
Expand Down Expand Up @@ -1945,11 +1945,78 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
DROP TABLE t1;
# End of 10.2 tests
#
# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
#
create table t0(a int);
insert into t0 select 1 from seq_1_to_78;
create table t1(a int);
insert into t1 select 1 from seq_1_to_26;
create table t10 (a int);
insert into t10 select 0 from t0, seq_1_to_4;
insert into t10 select 8693 from t1;
insert into t10 select 8694 from t1;
insert into t10 select 8695 from t1;
insert into t10 select 34783 from t1;
insert into t10 select 34784 from t1;
insert into t10 select 34785 from t1;
insert into t10 select 34785 from t0, seq_1_to_8;
insert into t10 select 65214 from t1;
insert into t10 select 65215 from t1;
insert into t10 select 65216 from t1;
insert into t10 select 65216 from t0, seq_1_to_52;
insert into t10 select 65217 from t1;
insert into t10 select 65218 from t1;
insert into t10 select 65219 from t1;
insert into t10 select 65219 from t0;
insert into t10 select 73913 from t1;
insert into t10 select 73914 from t1;
insert into t10 select 73915 from t1;
insert into t10 select 73915 from t0, seq_1_to_40;
insert into t10 select 78257 from t1;
insert into t10 select 78258 from t1;
insert into t10 select 78259 from t1;
insert into t10 select 91300 from t1;
insert into t10 select 91301 from t1;
insert into t10 select 91302 from t1;
insert into t10 select 91302 from t0, seq_1_to_6;
insert into t10 select 91303 from t1;
insert into t10 select 91304 from t1;
insert into t10 select 91305 from t1;
insert into t10 select 91305 from t0, seq_1_to_8;
insert into t10 select 99998 from t1;
insert into t10 select 99999 from t1;
insert into t10 select 100000 from t1;
set use_stat_tables=preferably;
analyze table t10 persistent for all;
Table Op Msg_type Msg_text
test.t10 analyze status Engine-independent statistics collected
test.t10 analyze status OK
flush tables;
set statement optimizer_trace=1 for
explain select * from t10 where a in (91303);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
# Must have selectivity_from_histogram <= 1.0:
select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
from information_schema.optimizer_trace;
json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
[
[
{
"column_name": "a",
"ranges":
["91303 <= a <= 91303"],
"selectivity_from_histogram": 0.035714283
}
]
]
drop table t0,t1,t10;
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;
DROP TABLE t1;
# End of 10.2 tests
set @@global.histogram_size=@save_histogram_size;
set optimizer_switch=@save_optimizer_switch_for_selectivity_test;
set @tmp_ust= @@use_stat_tables;
Expand Down
6 changes: 3 additions & 3 deletions mysql-test/main/selectivity_no_engine.result
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.00 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (20,180)
explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.00 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (180,20)
drop table t1, t2;
Expand Down Expand Up @@ -102,7 +102,7 @@ test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
explain extended select * from t1 where col1 in (1,2,3);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 2.97 Using where
Warnings:
Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where `test`.`t1`.`col1` in (1,2,3)
# Must not cause fp division by zero, or produce nonsense numbers:
Expand Down
44 changes: 5 additions & 39 deletions sql/sql_statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3902,50 +3902,16 @@ double Histogram::point_selectivity(double pos, double avg_sel)
}
else
{
/*
/*
The value 'pos' fits within one single histogram bucket.
Histogram buckets have the same numbers of rows, but they cover
different ranges of values.
We assume that values are uniformly distributed across the [0..1] value
range.
*/

/*
If all buckets covered value ranges of the same size, the width of
value range would be:
We also have avg_sel which is per-table average selectivity of col=const.
If there are popular values, this may be larger than one bucket, so
cap the returned number by the selectivity of one bucket.
*/
double avg_bucket_width= 1.0 / (get_width() + 1);

/*
Let's see what is the width of value range that our bucket is covering.
(min==max currently. they are kept in the formula just in case we
will want to extend it to handle multi-bucket case)
*/
double inv_prec_factor= (double) 1.0 / prec_factor();
double current_bucket_width=
(max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
(min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));

DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */

/*
So:
- each bucket has the same #rows
- values are unformly distributed across the [min_value,max_value] domain.

If a bucket has value range that's N times bigger then average, than
each value will have to have N times fewer rows than average.
*/
sel= avg_sel * avg_bucket_width / current_bucket_width;

/*
(Q: if we just follow this proportion we may end up in a situation
where number of different values we expect to find in this bucket
exceeds the number of rows that this histogram has in a bucket. Are
we ok with this or we would want to have certain caps?)
*/
sel= MY_MIN(avg_bucket_width, avg_sel);
}
return sel;
}
Expand Down

0 comments on commit 85cc831

Please sign in to comment.