diff --git a/mysql-test/r/mysqld--help-notwin.result b/mysql-test/r/mysqld--help-notwin.result index 94ee3f0b8423..58fd27026d5d 100644 --- a/mysql-test/r/mysqld--help-notwin.result +++ b/mysql-test/r/mysqld--help-notwin.result @@ -1995,6 +1995,9 @@ The following options may be given as the first argument: INFORMATION_SCHEMA.ROCKSDB_COMPACTION_HISTORY table. --rocksdb-max-latest-deadlocks=# Maximum number of recent deadlocks to store + --rocksdb-max-lock-memory=# + Range-locking mode: Maximum amount of memory that locks + from all transactions can use at a time --rocksdb-max-log-file-size=# DBOptions::max_log_file_size for RocksDB --rocksdb-max-manifest-file-size=# @@ -2207,6 +2210,10 @@ The following options may be given as the first argument: --rocksdb-use-direct-reads DBOptions::use_direct_reads for RocksDB --rocksdb-use-fsync DBOptions::use_fsync for RocksDB + --rocksdb-use-range-lock-manager-as-point + Use Range Lock Manager as point + --rocksdb-use-range-locking + Use Range Locking --rocksdb-validate-tables=# Verify all DD tables match all RocksDB tables (0 means no verification, 1 means verify and fail on error, and 2 @@ -3351,6 +3358,7 @@ rocksdb-max-background-jobs 2 rocksdb-max-bottom-pri-background-compactions 0 rocksdb-max-compaction-history 64 rocksdb-max-latest-deadlocks 5 +rocksdb-max-lock-memory 1073741824 rocksdb-max-log-file-size 0 rocksdb-max-manifest-file-size 1073741824 rocksdb-max-manual-compactions 10 @@ -3424,6 +3432,8 @@ rocksdb-use-default-sk-cf FALSE rocksdb-use-direct-io-for-flush-and-compaction FALSE rocksdb-use-direct-reads FALSE rocksdb-use-fsync FALSE +rocksdb-use-range-lock-manager-as-point FALSE +rocksdb-use-range-locking FALSE rocksdb-validate-tables 1 rocksdb-verify-row-debug-checksums FALSE rocksdb-wal-bytes-per-sync 0 diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations index e2f66bc7ae33..348bbf4b7ff8 100644 --- a/mysql-test/suite/rocksdb/combinations +++ 
b/mysql-test/suite/rocksdb/combinations @@ -11,3 +11,6 @@ rocksdb_write_batch_flush_threshold=1 [write_committed_with_delete_range] rocksdb_write_policy=write_committed rocksdb_enable_delete_range_for_drop_index = ON + +[range_locking] +rocksdb_use_range_locking=1 diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc new file mode 100644 index 000000000000..a8600daea77a --- /dev/null +++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc @@ -0,0 +1,3 @@ +if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) { + --skip Test requires range locking +} diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc new file mode 100644 index 000000000000..62c26b134bce --- /dev/null +++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc @@ -0,0 +1,5 @@ +--let $_use_range_locking= `select @@rocksdb_use_range_locking` +if ($_use_range_locking == 1) +{ + --skip Test doesn't support range locking +} diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc new file mode 100644 index 000000000000..cc6a328e566b --- /dev/null +++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc @@ -0,0 +1,99 @@ +--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +# +# An include to print contents of I_S.ROCKSDB_LOCKS +# +# Implicit "parameters" +# - Currently it prints locks on t1.PRIMARY +# +# Explicit "parameter" variables: +# - $TRX1_ID - print this transaction as "TRX1" +# - $TRX2_ID - print this transaction as "TRX2" +# +# - $select_from_is_rowlocks_current_trx_only +# - $order_by_rowkey +# +# - $SECOND_INDEX_NAME + +--disable_query_log +set @cf_id=(select column_family from information_schema.rocksdb_ddl + 
where table_name='t1' and index_name='PRIMARY'); +set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx + where thread_id=connection_id()); +set @indexnr= (select lower(lpad(hex(index_number),8,'0')) from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); + +set @indexnr_next= (select lower(lpad(hex(index_number+1),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); + +let $extra_where = where 1; + +if ($select_from_is_rowlocks_current_trx_only) +{ + let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id); +} + +## transaction column + +# If TRX1_ID is not specified, get the current transaction: +let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id"); +if ($TRX1_ID) +{ + let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID"); +} + +if ($TRX2_ID) +{ + let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID"); +} + +## CF_ID column +let $cf_id_col= column_family_id; + +if ($SECOND_INDEX_NAME) +{ + eval set @cf2_id=(select column_family from information_schema.rocksdb_ddl + where table_name='t1' and index_name='$SECOND_INDEX_NAME'); + + let $cf_id_col= replace($cf_id_col, @cf2_id, "\$cf2_id"); +} +let $cf_id_col= replace($cf_id_col, @cf_id, "\$cf_id"); + +## KEY column +let $key_col= (`key`); +if ($SECOND_INDEX_NAME) +{ + eval set @indexnr2= (select lower(lpad(hex(index_number),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='$SECOND_INDEX_NAME'); + + eval set @indexnr2_next= (select lower(lpad(hex(index_number+1),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='$SECOND_INDEX_NAME'); + + let $key_col = replace($key_col, @indexnr2, '\${indexnr2}'); + let $key_col = replace($key_col, @indexnr2_next, '\${indexnr2+1}'); +} + +let $key_col = replace($key_col, @indexnr, '\${indexnr}'); 
+let $key_col = replace($key_col, @indexnr_next, '\${indexnr+1}'); + +## ORDER BY +if ($order_by_rowkey) +{ + let $extra_order_by = ORDER BY 3,2; +} + +if (!$order_by_rowkey) +{ + --sorted_result +} + +eval select + $cf_id_col as COLUMN_FAMILY_ID, + $transaction_col as TRANSACTION_ID, + $key_col as `KEY`, + mode +from information_schema.rocksdb_locks $extra_where $extra_order_by; + +--enable_query_log diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result new file mode 100644 index 000000000000..3938fa38b6cb --- /dev/null +++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result @@ -0,0 +1,652 @@ +DROP TABLE IF EXISTS test; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connect con3,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connection con1; +create table test (id int primary key, value int) engine=rocksdb; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test; +id value +1 10 +2 20 +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +rollback; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +commit; +connection con2; +select * from test; +id value +1 11 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); 
+begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 22 where id = 2; +connection con1; +select * from test where id = 2; +id value +2 20 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +update test set value = 19 where id = 2; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +update test set value = 18 where id = 2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +commit; +connection con3; +select * from test; +id value +1 12 +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value = 30; +id value +connection con2; +insert into test (id, value) values(3, 30); +commit; +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = value + 10; +connection con2; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors'; +select * from test; +id value +1 10 +2 20 +delete from test where value = 20; +connection con1; +commit; +connection con2; +select * from test; +id value +2 30 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), 
(2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +select * from test; +id value +1 12 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +select * from test where id = 2; +id value +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +select * from test where id = 2; +id value +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 5 = 0; +id value +1 10 +2 20 +connection con2; +update test set value = 12 where value = 10; +commit; +connection con1; +select * from test where value % 3 = 0; +id value +1 12 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test; +id value +1 10 +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +delete from test where value = 20; +select * from test where id = 2; +id value +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * 
from test where id in (1,2); +id value +1 10 +2 20 +connection con2; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 21 where id = 2; +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 3 = 0; +id value +connection con2; +select * from test where value % 3 = 0; +id value +connection con1; +insert into test (id, value) values(3, 30); +connection con2; +insert into test (id, value) values(4, 42); +connection con1; +commit; +connection con2; +commit; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection default; +drop table test; +disconnect con1; +disconnect con2; +disconnect con3; +DROP TABLE IF EXISTS test; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con3,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +create table test (id int primary key, value int) engine=rocksdb; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test; +id value +1 10 +2 20 +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +rollback; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update 
test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +commit; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 22 where id = 2; +connection con1; +select * from test where id = 2; +id value +2 20 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +update test set value = 19 where id = 2; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +update test set value = 18 where id = 2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +commit; +connection con3; +select * from test; +id value +1 11 +2 19 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value = 30; +id value +connection con2; +insert into test (id, value) values(3, 30); +commit; +connection con1; +select * from test where value % 3 = 0; +id value +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = value + 10; +connection con2; +select variable_value into @a from 
performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors'; +select * from test; +id value +1 10 +2 20 +delete from test where value = 20; +connection con1; +commit; +connection con2; +select * from test; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +select * from test; +id value +1 12 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +select * from test where id = 2; +id value +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +select * from test where id = 2; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 5 = 0; +id value +1 10 +2 20 +connection con2; +update test set value = 12 where value = 10; +commit; +connection con1; +select * from test where value % 3 = 0; +id value +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test; +id value +1 10 +2 20 +update test set value = 12 where id = 1; +update 
test set value = 18 where id = 2; +commit; +connection con1; +delete from test where value = 20; +select * from test where id = 2; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con2; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 21 where id = 2; +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 3 = 0; +id value +connection con2; +select * from test where value % 3 = 0; +id value +connection con1; +insert into test (id, value) values(3, 30); +connection con2; +insert into test (id, value) values(4, 42); +connection con1; +commit; +connection con2; +commit; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection default; +drop table test; +disconnect con1; +disconnect con2; +disconnect con3; diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result new file mode 100644 index 000000000000..b48535c5ee6c --- /dev/null +++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result @@ -0,0 +1,182 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 ( +id INT, +val1 INT, +val2 INT, +PRIMARY KEY (id) +) ENGINE=rocksdb; +INSERT INTO t1 VALUES(1,1,1),(2,1,2); +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 1 2 +UPDATE t1 SET val1=2 WHERE id=2; +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +SHOW ENGINE rocksdb TRANSACTION 
STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t1 VALUES(20,1,1),(30,30,30); +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +20 1 1 +30 30 30 +UPDATE t1 SET val1=20, val2=20 WHERE id=20; +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +20 20 20 +30 30 30 +DELETE FROM t1 WHERE id=30; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +---SNAPSHOT, ACTIVE NUM sec +MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION +SHOW ENGINE rocksdb TRANSACTION STATUS +lock count 4, write count 4 +insert count 2, update count 1, delete count 1 +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +START TRANSACTION; +INSERT INTO t1 
VALUES(40,40,40); +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +COMMIT; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +SET AUTOCOMMIT=1; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 ( +id1 INT, +id2 INT, +value INT, +PRIMARY KEY (id1), +KEY (id2) +) ENGINE=rocksdb; +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t2 VALUES(1,2,0),(10,20,30); +UPDATE t2 SET value=3 WHERE id2=2; +DELETE FROM t2 WHERE id1=10; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SET AUTOCOMMIT=1; +DROP TABLE t2; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 ( +id1 INT, +id2 INT, +value INT, +PRIMARY KEY (id1), +UNIQUE KEY (id2) +) ENGINE=rocksdb; +SET AUTOCOMMIT=0; +START 
TRANSACTION; +INSERT INTO t2 VALUES(1,2,0),(10,20,30); +UPDATE t2 SET value=3 WHERE id2=2; +DELETE FROM t2 WHERE id1=10; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SET AUTOCOMMIT=1; +DROP TABLE t2; diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result new file mode 100644 index 000000000000..0592b0992385 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result @@ -0,0 +1,106 @@ +DROP TABLE IF EXISTS t1; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb; +START TRANSACTION; +SELECT a FROM t1; +a +connection con2; +BEGIN; +INSERT INTO t1 (a) VALUES(1); +connection con1; +SELECT a FROM t1; +a +connection con2; +INSERT INTO t1 (a) VALUES (2); +connection con1; +SELECT a FROM t1; +a +INSERT INTO t1 (a) SELECT a+100 FROM t1; +SELECT a FROM t1; +a +connection con2; +SELECT a FROM t1; +a +1 +2 +COMMIT; +SELECT a FROM t1; +a +1 +2 +connection con1; +SELECT a FROM t1; +a +INSERT INTO t1 (a) SELECT a+200 FROM t1; +SELECT a FROM t1; +a +201 +202 +COMMIT; +SELECT a FROM t1; +a +1 +2 +201 +202 +connection con2; +SELECT a FROM t1; +a +1 +2 +201 +202 +connection default; +CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb; +INSERT INTO t2 (a) VALUES (1); +COMMIT; +connection con1; 
+BEGIN; +SELECT a from t2; +a +1 +INSERT INTO t2 (a) VALUES (1), (3); +ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY' +connection con2; +INSERT INTO t2 (a) VALUES (2); +COMMIT; +connection con1; +SELECT a from t2; +a +1 +COMMIT; +connection default; +disconnect con1; +disconnect con2; +DROP TABLE t1; +DROP TABLE t2; +CREATE TABLE t3 ( +pk int unsigned PRIMARY KEY, +count int unsigned DEFAULT '0' +) ENGINE=ROCKSDB; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +BEGIN; +SELECT * FROM t3; +pk count +connection con2; +BEGIN; +INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1; +COMMIT; +connection con1; +INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1; +COMMIT; +SELECT count FROM t3; +count +1 +connection default; +disconnect con1; +disconnect con2; +DROP TABLE t3; diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result new file mode 100644 index 000000000000..4a2f99f86fc7 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking.result @@ -0,0 +1,596 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +insert into t1 values (15,15); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for 
update; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +## Test: check that regular read does not get a range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +pk a +20 20 +30 30 +rollback; +connection con1; +rollback; +## Test that locks are not released when a statement inside +## a transaction is rolled back +create table t2 ( +pk int, +a int, +primary key (pk) comment 'default', +unique key(a) comment 'default' +) engine=rocksdb; +insert into t2 values (1,1),(2,2); +begin; +insert into t2 values (3,3); +insert into t2 values (10,2); +ERROR 23000: Duplicate entry '2' for key 't2.a' +connection con2; +begin; +select * from t2 where pk=3 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY +rollback; +connection con1; +rollback; +drop table t2; +connection default; +disconnect con1; +disconnect con2; +drop table t1; +# +# Test INFORMATION_SCHEMA.lock_info in range-locking mode +# +connect con1,localhost,root,,; +connection con1; +create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +delete from t1 where pk between 25 and 40; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY 
mode +$cf_id $trx_id ${indexnr}8000000a X +$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X +rollback; +begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. (after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +pk a +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X +rollback; +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) comment 'default' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; +# +# Test that locks on ranges on non-unique secondary keys inhibit +# modifications of the contents of these ranges +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +key(kp1, kp2) comment 'default' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 values 
(2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +explain +select * from t1 where kp1=2 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2) +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 3 1234 +2 5 1234 +2 7 1234 +connection default; +begin; +insert into t1 values (2, 9, 9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +delete from t1 where kp1=2 and kp2=5; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=333 where kp1=2 and kp2=3; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=2 where kp1=1 and kp2=8; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# Transaction isolation test +# +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; +# Examine the result: +# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +# 
(and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; +pk a +1 1 +2 2223 +3 3 +disconnect con1; +connection default; +drop table t1; +# +# Same test as above, but check the range scan +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; +# Examine the result: +# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; +pk a +1 1 +2 2 +3 2223 +4 2223 +5 2223 +6 6 +disconnect con1; +connection default; +drop table t1; +# +# Same as above, but test SELECT FOR UPDATE. 
+# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; +# TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +pk a +2 2 +3 3 +select * from t1 where pk=2 for update; +pk a +2 222 +select * from t1 where pk=2 lock in share mode; +pk a +2 222 +select * from t1 where pk=2; +pk a +2 2 +commit; +disconnect con1; +connection default; +drop table t1; +# +# Another no-snapshot-checking test, this time for single-statement +# transaction +# +create table t1 ( +pk int, +a int, +name varchar(16), +primary key(pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1, 'row1'), (2,2,'row2'); +connect con1,localhost,root,,; +connection con1; +select get_lock('row1', 100); +get_lock('row1', 100) +1 +connection default; +# The following will read the first row (1,1,'row1'), and stop. +update t1 set a=a+100 where get_lock(name, 1000)=1; +connection con1; +update t1 set a=5 where pk=2; +select release_lock('row1'); +release_lock('row1') +1 +connection default; +# Look at the row with pk=2: +# 2, 105, row2 - means the UPDATE was reading current data (Correct) +# 2, 102, row - means the UPDATE read the snapshot (incorrect) +select * from t1; +pk a name +1 101 row1 +2 105 row2 +# Try releasing both locks (in 5.6, we will be holding only the second one) +select release_lock(name) from t1; +release_lock(name) +1 +1 +disconnect con1; +connection default; +drop table t1; +# +# Check that I_S.processlist.state is set correctly now. 
+# +create table t1( +pk int, +a int, +primary key(pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +begin; +select * from t1 where pk=2 for update; +pk a +2 2 +connect con1,localhost,root,,; +begin; +set rocksdb_lock_wait_timeout=300; +select * from t1 where pk=2 for update; +connection default; +# Now, will wait until we see con1 have state="Waiting for row lock" +rollback; +connection con1; +pk a +2 2 +rollback; +disconnect con1; +connection default; +drop table t1; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST +# +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int, +primary key(pk1, pk2) comment 'default' +) engine=rocksdb; +insert into t1 +select +A.a, B.a, A.a*10+B.a +from +t0 A, t0 B; +connect con1,localhost,root,,; +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); +connection default; +begin; +# Should use ref access w/o filesort: +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +3 9 39 +3 8 38 +3 7 37 +3 6 36 +3 5 35 +3 4 34 +3 3 33 +3 2 32 +3 1 31 +3 0 30 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X +rollback; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +# +begin; +# Should use range access with 2 keyparts and w/o 
filesort: +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +4 8 48 +4 7 47 +4 6 46 +4 5 45 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X +rollback; +connection con1; +rollback; +connection default; +drop table t0, t1; +# +# A bug: range locking was not used when scan started at table start or end +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1 select a*2,a*2 from t10; +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +begin; +select * from t1 where pk<10 order by pk limit 10 for update; +pk a +0 0 +2 2 +4 4 +6 6 +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}8000000a X +rollback; +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +pk a +1998 1998 +1996 1996 +1994 1994 +1992 1992 +# select * from information_schema.rocksdb_locks; # With replacements 
by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t10,t1; +# +# Range locking and READ-COMMITTED isolation level +# +connect con1,localhost,root,,; +connection con1; +set session transaction isolation level read committed; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +begin; +select * from t1 where pk between 2 and 5 for update; +pk a +2 NULL +3 NULL +4 NULL +5 NULL +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +begin; +update t1 set a=a+1 where pk between 2 and 5; +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +drop table t1; +disconnect con1; +connection default; +# +# Range Locking and READ-COMMITTED, another test +# +create table t1 ( +pk int, +a int, +b int, +primary key (pk), +key(a) +) engine=rocksdb; +insert into t1 values +(1, 100, 1000), +(2, 200, 2000), +(3, 300, 3000); +set transaction isolation level repeatable read; +begin; +update t1 set b = b + 1 where a > 200; +connect con1,localhost,root,,; +connection con1; +set transaction isolation level read committed; +begin; +insert into t1 values 
(4, 150, 1500); +insert into t1 values (5, 250, 1500); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a +rollback; +disconnect con1; +connection default; +rollback; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_conc_test.result b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result new file mode 100644 index 000000000000..a70152da808f --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result @@ -0,0 +1,4 @@ +set @save_rlwt=@@rocksdb_lock_wait_timeout; +# Run range_locking_conc_test.py +set global rocksdb_lock_wait_timeout= @save_rlwt; +DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result new file mode 100644 index 000000000000..00fd1788dfd6 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result @@ -0,0 +1,453 @@ +set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set @prior_deadlock_detect = @@rocksdb_deadlock_detect; +set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; +set global rocksdb_deadlock_detect = on; +set global rocksdb_lock_wait_timeout = 10000; +# Clears deadlock buffer of any prior deadlocks. 
+set global rocksdb_max_latest_deadlocks = 0; +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +create table t (i int primary key) engine=rocksdb; +insert into t values (1), (2), (3); +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #1 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #2 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where 
i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 10; +Deadlock #3 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- 
+SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 1; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID 
+COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set rocksdb_deadlock_detect_depth = 2; +# Range locking code will report deadlocks, because it doesn't honor +# rocksdb_deadlock_detect_depth: +Deadlock #4 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +begin; +select * from t where i=3 for update; +i +3 +select * from t where i=2 for update; +select * from t where i=3 for update; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +deadlocks +true +rollback; +i +3 +rollback; +i +2 +rollback; +set global rocksdb_max_latest_deadlocks = 5; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: 
PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #6 +create table t1 (id int primary key, value int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5); +begin; +update t1 set value=value+100 where id=1; +update t1 set value=value+100 where id=2; +begin; +update t1 set value=value+200 where id=3; +update t1 set value=value+100 where id=3; +update t1 set value=value+200 where id=1; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select * from t1; +id value +1 101 +2 102 +3 103 +4 4 +5 5 +drop table t1; +set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout; +set global rocksdb_deadlock_detect = @prior_deadlock_detect; +drop table t; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; 
IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 0; +# Clears deadlock buffer of any existent deadlocks. +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result new file mode 100644 index 000000000000..dd19d728ef24 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result @@ -0,0 +1,27 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +show variables like 'rocksdb_max_lock_memory'; +Variable_name Value +rocksdb_max_lock_memory 1024 +show status like 'rocksdb_locktree_escalation_count'; +Variable_name Value +rocksdb_locktree_escalation_count 0 +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 +select +A.a + B.a*10 + C.a*100 + D.a*1000, +12345 +from t0 A, t0 B, t0 C, t0 D; +select count(*) from t1; +count(*) +10000 +show status like 'rocksdb_locktree_escalation_count'; +Variable_name Value 
+rocksdb_locktree_escalation_count 127 +drop table t0,t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_partial_index.result b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result new file mode 100644 index 000000000000..0c3e85fe8fef --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result @@ -0,0 +1,202 @@ +create table t0(a int primary key) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int not null, +b int, +primary key (pk1, pk2), +key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5' +) engine=rocksdb; +insert into t1 select +1, +A.a, +100 + A.a, +123456 +from t0 A; +select * from t1 force index (key1) where pk1=1; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +insert into t1 select +2, +A.a, +100 + A.a, +123456 +from t0 A limit 3; +insert into t1 select +10000 + A.a +10 *B.a +100*C.a, +A.a, +100 + A.a, +123456 +from t0 A, t0 B, t0 C; +create table t3(pk int primary key); +connect con2,localhost,root,,; +connection con2; +begin; +insert into t3 values(3333333); +connection default; +# +# First, test a query with range lock +# +explain +select * from t1 force index (key1) where pk1>=1 and pk1<=10; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range key1 key1 4 NULL # 100.00 Using index condition +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` FORCE INDEX (`key1`) where ((`test`.`t1`.`pk1` >= 1) and (`test`.`t1`.`pk1` <= 10)) +connect con1,localhost,root,,; +connection con1; +begin; +# Allocate a snapshot +select * from t0 where a=3; +a +3 +connection default; +# Make some modifications not visible in the snapshot +insert into t1 
values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); +connection con1; +# This doesn't see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +# This DOES see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +1 11 99999 99999 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +2 11 99999 99999 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}8000000a:1 X +$cf_id $trx_id ${indexnr}8000000180000000 X +$cf_id $trx_id ${indexnr}8000000180000001 X +$cf_id $trx_id ${indexnr}8000000180000002 X +$cf_id $trx_id ${indexnr}8000000180000003 X +$cf_id $trx_id ${indexnr}8000000180000004 X +$cf_id $trx_id ${indexnr}8000000180000005 X +$cf_id $trx_id ${indexnr}8000000180000006 X +$cf_id $trx_id ${indexnr}8000000180000007 X +$cf_id $trx_id ${indexnr}8000000180000008 X +$cf_id $trx_id ${indexnr}8000000180000009 X +$cf_id $trx_id ${indexnr}800000018000000b X +$cf_id $trx_id ${indexnr}8000000280000000 X +$cf_id $trx_id ${indexnr}8000000280000001 X +$cf_id $trx_id ${indexnr}8000000280000002 X +$cf_id $trx_id ${indexnr}800000028000000b X +rollback; +# +# Now, test a query with LockingIterator +# +delete from t1 where b=99999; +begin; +# Allocate a snapshot +select * from t0 where a=3; +a +3 +connection default; +# Make some modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); +connection con1; 
+# This doesn't see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +10000 0 100 123456 +10001 1 101 123456 +# This DOES see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +1 11 99999 99999 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +2 11 99999 99999 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf2_id $trx_id ${indexnr2}80000001 X +$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}800000018000006480000000 X +$cf2_id $trx_id ${indexnr2}800000018000006480000000-${indexnr2}800000018000006580000001 X +$cf2_id $trx_id ${indexnr2}800000018000006580000001-${indexnr2}800000018000006680000002 X +$cf2_id $trx_id ${indexnr2}800000018000006680000002-${indexnr2}800000018000006780000003 X +$cf2_id $trx_id ${indexnr2}800000018000006780000003-${indexnr2}800000018000006880000004 X +$cf2_id $trx_id ${indexnr2}800000018000006880000004-${indexnr2}800000018000006980000005 X +$cf2_id $trx_id ${indexnr2}800000018000006980000005-${indexnr2}800000018000006a80000006 X +$cf2_id $trx_id ${indexnr2}800000018000006a80000006-${indexnr2}800000018000006b80000007 X +$cf2_id $trx_id ${indexnr2}800000018000006b80000007-${indexnr2}800000018000006c80000008 X +$cf2_id $trx_id ${indexnr2}800000018000006c80000008-${indexnr2}800000018000006d80000009 X +$cf2_id $trx_id ${indexnr2}800000018000006d80000009-${indexnr2}800000018001869f8000000b X +$cf2_id $trx_id ${indexnr2}800000018001869f8000000b-${indexnr2+1} X +$cf_id 
$trx_id ${indexnr}8000000180000000 X +$cf_id $trx_id ${indexnr}8000000180000001 X +$cf_id $trx_id ${indexnr}8000000180000002 X +$cf_id $trx_id ${indexnr}8000000180000003 X +$cf_id $trx_id ${indexnr}8000000180000004 X +$cf_id $trx_id ${indexnr}8000000180000005 X +$cf_id $trx_id ${indexnr}8000000180000006 X +$cf_id $trx_id ${indexnr}8000000180000007 X +$cf_id $trx_id ${indexnr}8000000180000008 X +$cf_id $trx_id ${indexnr}8000000180000009 X +$cf_id $trx_id ${indexnr}800000018000000b X +$cf_id $trx_id ${indexnr}80000002-${indexnr}8000271080000000 X +rollback; +disconnect con1; +connection default; +disconnect con2; +drop table t0, t1,t3; diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result new file mode 100644 index 000000000000..1067087e8165 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result @@ -0,0 +1,50 @@ +select @@rocksdb_use_range_locking; +@@rocksdb_use_range_locking +1 +set debug_sync='RESET'; +create table ten(a int primary key); +insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table one_k(a int primary key); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C; +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 select a,a from ten; +insert into t1 select a+40, a+40 from ten; +insert into t1 select a+100, a+100 from one_k; +delete from t1 where pk=44; +set global rocksdb_force_flush_memtable_and_lzero_now=1; +begin; +set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont'; +update t1 set a=a+100 where pk < 3 or pk between 10 and 50; +set debug_sync='now WAIT_FOR con1_stopped'; +insert into t1 values (44,5000); +delete from t1 where pk= 42; +update t1 set a=5000 where pk between 40 and 45; +set global rocksdb_force_flush_memtable_and_lzero_now=1; +set debug_sync='now SIGNAL con1_cont'; +select * from t1 where pk<100; +pk a +0 100 +1 101 +2 102 +3 3 
+4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +40 5100 +41 5100 +43 5100 +44 5100 +45 5100 +46 146 +47 147 +48 148 +49 149 +commit; +set debug_sync='RESET'; +drop table t1, ten, one_k; diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result new file mode 100644 index 000000000000..e39c6bb3339d --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result @@ -0,0 +1,556 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +insert into t1 values (15,15); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +## Test: check that regular read does not get a range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +pk a +20 20 +30 30 +rollback; +connection con1; +rollback; +## Test that locks are not released when a statement inside +## a transaction is rolled back +create table t2 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1', +unique key(a) comment '' +) engine=rocksdb; +insert into t2 
values (1,1),(2,2); +begin; +insert into t2 values (3,3); +insert into t2 values (10,2); +ERROR 23000: Duplicate entry '2' for key 't2.a' +connection con2; +begin; +select * from t2 where pk=3 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY +rollback; +connection con1; +rollback; +drop table t2; +connection default; +disconnect con1; +disconnect con2; +drop table t1; +# +# Test INFORMATION_SCHEMA.lock_info in range-locking mode +# +connect con1,localhost,root,,; +connection con1; +create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +delete from t1 where pk between 25 and 40; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X +rollback; +begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. 
(after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +pk a +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X +rollback; +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; +# +# Test that locks on ranges on non-unique secondary keys inhibit +# modifications of the contents of these ranges +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +key(kp1, kp2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +explain +select * from t1 where kp1=2 for 
update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2) +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 3 1234 +2 5 1234 +2 7 1234 +connection default; +begin; +insert into t1 values (2, 9, 9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +delete from t1 where kp1=2 and kp2=5; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=333 where kp1=2 and kp2=3; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=2 where kp1=1 and kp2=8; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# Transaction isolation test +# +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; +# Examine the result: +# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +# (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; +pk a +1 1 +2 2223 +3 3 +disconnect con1; +connection default; +drop table t1; +# +# Same test as above, but check the 
range scan +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +6 6 +5 5 +4 4 +3 3 +2 2 +1 1 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; +# Examine the result: +# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; +pk a +6 6 +5 2223 +4 2223 +3 2223 +2 2 +1 1 +disconnect con1; +connection default; +drop table t1; +# +# Same as above, but test SELECT FOR UPDATE. +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +6 6 +5 5 +4 4 +3 3 +2 2 +1 1 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; +# TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +pk a +2 2 +3 3 +select * from t1 where pk=2 for update; +pk a +2 222 +select * from t1 where pk=2 lock in share mode; +pk a +2 222 +select * from t1 where pk=2; +pk a +2 2 +commit; +disconnect con1; +connection default; +drop table t1; +# +# Check that I_S.processlist.state is set correctly now. 
+# +create table t1( +pk int, +a int, +primary key(pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +begin; +select * from t1 where pk=2 for update; +pk a +2 2 +connect con1,localhost,root,,; +begin; +set rocksdb_lock_wait_timeout=300; +select * from t1 where pk=2 for update; +connection default; +# Now, will wait until we see con1 have state="Waiting for row lock" +rollback; +connection con1; +pk a +2 2 +rollback; +disconnect con1; +connection default; +drop table t1; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST +# +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int, +primary key(pk1, pk2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 +select +A.a, B.a, A.a*10+B.a +from +t0 A, t0 B; +connect con1,localhost,root,,; +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); +connection default; +begin; +# Should use ref access w/o filesort: +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +3 9 39 +3 8 38 +3 7 37 +3 6 36 +3 5 35 +3 4 34 +3 3 33 +3 2 32 +3 1 31 +3 0 30 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X +rollback; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +# +begin; +# Should use range access with 2 keyparts and w/o 
filesort: +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +4 8 48 +4 7 47 +4 6 46 +4 5 45 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X +rollback; +connection con1; +rollback; +connection default; +drop table t0, t1; +# +# A bug: range locking was not used when scan started at table start or end +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1 select a*2,a*2 from t10; +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +begin; +select * from t1 where pk<10 order by pk limit 10 for update; +pk a +0 0 +2 2 +4 4 +6 6 +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}8000000a X +rollback; +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +pk a +1998 1998 +1996 1996 +1994 1994 +1992 1992 +# select * from information_schema.rocksdb_locks; # With replacements 
by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t10,t1; +# +# Range locking and READ-COMMITTED isolation level +# +connect con1,localhost,root,,; +connection con1; +set session transaction isolation level read committed; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +begin; +select * from t1 where pk between 2 and 5 for update; +pk a +2 NULL +3 NULL +4 NULL +5 NULL +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +begin; +update t1 set a=a+1 where pk between 2 and 5; +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +drop table t1; +disconnect con1; +connection default; +# +# Range Locking and READ-COMMITTED, another test +# +create table t1 ( +pk int, +a int, +b int, +primary key (pk), +key(a) +) engine=rocksdb; +insert into t1 values +(1, 100, 1000), +(2, 200, 2000), +(3, 300, 3000); +set transaction isolation level repeatable read; +begin; +update t1 set b = b + 1 where a > 200; +connect con1,localhost,root,,; +connection con1; +set transaction isolation level read committed; +begin; +insert into t1 values 
(4, 150, 1500); +insert into t1 values (5, 250, 1500); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a +rollback; +disconnect con1; +connection default; +rollback; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result new file mode 100644 index 000000000000..bf1cb916a5b6 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result @@ -0,0 +1,291 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) +) engine=rocksdb; +insert into t1 select +A.a + B.a*10 + C.a*100, +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +# Make another connection to get the lock tree out of the STO-mode +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +connection default; +begin; +select * from t1 where pk=11 for update; +pk a +11 11 +# Now, we will just see locks on 10=0xA and 11=0xB: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000b X +# +# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT +# +explain +select * from t1 where pk>=500 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3 +select * from t1 where pk>=500 order by pk limit 3 for update; +pk a +500 500 +501 501 +502 502 +# select 
* from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000b X +$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X +rollback; +begin; +select * from t1 where pk=11 for update; +pk a +11 11 +explain +select * from t1 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3 +select * from t1 order by pk limit 3 for update; +pk a +0 0 +1 1 +2 2 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}80000002 X +$cf_id $trx_id ${indexnr}8000000b X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# Concurrent tests: let one thread do SeekForUpdate and the other +# interfere by committing modifications +# +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) +) engine=rocksdb; +insert into t1 select +A.a + B.a*10 + C.a*100, +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +select * from t1 where pk<10; +pk a +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +delete from t1 where pk<10; +select * from t1 where pk<10; +pk a +# Test what happens when another transaction commits a row +# right before the range we are about to lock (nothing) +explain +select * from t1 where pk >=5 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS 
`pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3 +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 3 for update; +connect con1,localhost,root,,; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (3,3); +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +10 10 +11 11 +12 12 +rollback; +delete from t1 where pk=3; +# +# Now, repeat the test but let the other transaction insert the row into +# the range we are locking +explain +select * from t1 where pk >=5 order by pk limit 1 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1 +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (8,8); +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X +rollback; +delete from t1 where pk=8; +# +# Repeat the third time, this time deleting the row that SeekForUpdate saw +# +insert into t1 values (7,7); +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +delete 
from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +10 10 +rollback; +# +# Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT +# error. MyRocks code should now be prepared that data reads cause this +# error +# +insert into t1 values (7,7); +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +begin; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +# +# Test the thd_killed check in the iterator +# +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +kill query CONN_ID; +connection default; +ERROR HY000: Got error 10 'Operation aborted: ' from ROCKSDB +rollback; +# +# Backward scan test +# +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +insert into t1 values +(1001, 1001), +(1005, 1005), +(1007, 1007), +(1010, 1010); +begin; +select * from t1 order by pk desc limit 2 for update; +pk a +1010 1010 +1007 1007 +# The below will lock from pk=1007 (0x3ef) till the end of the table: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X +rollback; +begin; +select * from t1 where pk <1007 order by pk desc limit 2 for update; +pk a +1005 1005 +1001 1001 +# select * from information_schema.rocksdb_locks; # With replacements by 
select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X +connection con1; +rollback; +connection default; +rollback; +# +# Backward scan test 2: error condition +# +connection con1; +begin; +select * from t1 where pk=1010 for update; +pk a +1010 1010 +connection default; +begin; +select * from t1 order by pk desc limit 2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +begin; +select * from t1 where pk=1007 for update; +pk a +1007 1007 +connection default; +begin; +select * from t1 order by pk desc limit 2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# A test: full table scan doesn't lock gaps +# +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 values (10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +connection con1; +begin; +select * from t1 for update; +pk a +10 10 +20 20 +30 30 +connection con2; +insert into t1 values (5,5); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +disconnect con1; +disconnect con2; +connection default; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result new file mode 100644 index 000000000000..2afb2eea5895 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result @@ -0,0 +1,142 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk 
int, +a int, +primary key (pk) comment 'rlsfu_test' +) engine=rocksdb; +insert into t1 (pk) +select +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +delete from t1 where pk<100; +connect con1,localhost,root,,; +connection con1; +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 5 for update; +connection default; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 (pk) values +(10),(20),(30),(40),(50); +set debug_sync='now SIGNAL spoiler_inserted'; +connection con1; +pk a +10 NULL +20 NULL +30 NULL +40 NULL +50 NULL +# This must return 1, no 5: +select lock_count from information_schema.rocksdb_trx +where thread_id=CONNECTION_ID(); +lock_count +1 +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 'rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; 
+drop table t1; +set global rocksdb_enable_iterate_bounds=off; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 'rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; +drop table t1; +set global rocksdb_enable_iterate_bounds=on; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result new file mode 100644 index 000000000000..fef9e0ef6a0b --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result @@ -0,0 +1,142 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:rlsfu_test' +) engine=rocksdb; +insert into t1 (pk) +select +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +delete from t1 where 
pk<100; +connect con1,localhost,root,,; +connection con1; +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 5 for update; +connection default; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 (pk) values +(10),(20),(30),(40),(50); +set debug_sync='now SIGNAL spoiler_inserted'; +connection con1; +pk a +10 NULL +20 NULL +30 NULL +40 NULL +50 NULL +# This must return 1, no 5: +select lock_count from information_schema.rocksdb_trx +where thread_id=CONNECTION_ID(); +lock_count +1 +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 'rev:rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; +drop table t1; +set global rocksdb_enable_iterate_bounds=off; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 
'rev:rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; +drop table t1; +set global rocksdb_enable_iterate_bounds=on; diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result new file mode 100644 index 000000000000..580108de6f6b --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result @@ -0,0 +1,251 @@ +select @@rocksdb_use_range_locking; +@@rocksdb_use_range_locking +1 +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 select a,a from t0; +# A basic test for shared locks +begin; +select * from t1 where pk=3 for update; +pk a +3 3 +select * from t1 where pk=5 lock in share mode; +pk a +5 5 +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where pk=5 lock in share mode; +pk a +5 5 +# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S: 
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +$cf_id $TRX2_ID ${indexnr}80000005 S +rollback; +# Now, TRX2_ID should be gone: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +connection default; +# Get a read lock on pk=3 (where we have a write lock). +# The result should be that we will still have a write lock +select * from t1 where pk=3 for update; +pk a +3 3 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +# Get a write lock on pk=5 (where we have a read lock). +# The result should be that we will have a write lock. 
+select * from t1 where pk=5 for update; +pk a +5 5 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 X +connection default; +rollback; +# +# Test if a read lock inhibits write locks +# +begin; +select * from t1 where pk=2 lock in share mode; +pk a +2 2 +select * from t1 where pk=8 for update; +pk a +8 8 +connection con1; +begin; +select * from t1 where pk=2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +select * from t1 where pk between 0 and 4 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +delete from t1 where pk=2; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +# Get a shared lock +select * from t1 where pk=2 lock in share mode; +pk a +2 2 +# But this should still prevent us from acquiring a write lock on that value: +select * from t1 where pk=2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection default; +rollback; +drop table t1; +create table t1 ( +pk int not null primary key, +a int not null, +key(a) +) engine=rocksdb; +insert into t1 +select +A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a +from +t0 A, t0 B, t0 C, t0 D; +set global rocksdb_force_flush_memtable_now=1; +connection con1; +begin; +select * from t1 where pk=900 for update; +pk a +900 900 +connection default; +begin; +explain +select * from t1 where a between 2 and 5 lock in share mode; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from 
`test`.`t1` where (`test`.`t1`.`a` between 2 and 5) +select * from t1 where a between 2 and 5 lock in share mode; +pk a +2 2 +3 3 +4 4 +5 5 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X +$cf_id $TRX1_ID ${indexnr}80000002 S +$cf_id $TRX1_ID ${indexnr}80000003 S +$cf_id $TRX1_ID ${indexnr}80000004 S +$cf_id $TRX1_ID ${indexnr}80000005 S +$cf_id $TRX1_ID ${indexnr}80000006 S +$cf_id $TRX2_ID ${indexnr}80000384 X +rollback; +disconnect con1; +drop table t0,t1; +# +# Test shared point locks and lock escalation +# +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 +select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C; +show status like 'rocksdb_locktree_current_lock_memory'; +Variable_name Value +rocksdb_locktree_current_lock_memory 0 +connect con1,localhost,root,,; +connection con1; +begin; +# CON1: get some shared locks +select * from t1 where pk=1001 lock in share mode; +pk a +1001 12345 +select * from t1 where pk=1100 lock in share mode; +pk a +1100 12345 +select * from t1 where pk=1200 lock in share mode; +pk a +1200 12345 +select * from t1 where pk=2500 lock in share mode; +pk a +connection default; +begin; +# DEFAULT: get the same locks so we have locks with multiple owners +select * from t1 where pk=1001 lock in share mode; +pk a +1001 12345 +select * from t1 where pk=1100 lock in share mode; +pk a +1100 12345 +select * from t1 where pk=1200 lock in share mode; +pk a +1200 12345 +# DEFAULT: get shared locks with one owner: +select * from t1 where pk=2510 lock in share mode; +pk a +# DEFAULT: exclusive locks on 0-10: +insert into t1 select A.a, 0 from t0 A; +connection con1; +# CON1: exclusive locks on 2000-2010: +insert into t1 select 2000+A.a, 0 from t0 A; +# select * 
from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX2_ID ${indexnr}80000000 X +$cf_id $TRX2_ID ${indexnr}80000001 X +$cf_id $TRX2_ID ${indexnr}80000002 X +$cf_id $TRX2_ID ${indexnr}80000003 X +$cf_id $TRX2_ID ${indexnr}80000004 X +$cf_id $TRX2_ID ${indexnr}80000005 X +$cf_id $TRX2_ID ${indexnr}80000006 X +$cf_id $TRX2_ID ${indexnr}80000007 X +$cf_id $TRX2_ID ${indexnr}80000008 X +$cf_id $TRX2_ID ${indexnr}80000009 X +$cf_id $TRX1_ID ${indexnr}800003e9 S +$cf_id $TRX2_ID ${indexnr}800003e9 S +$cf_id $TRX1_ID ${indexnr}8000044c S +$cf_id $TRX2_ID ${indexnr}8000044c S +$cf_id $TRX1_ID ${indexnr}800004b0 S +$cf_id $TRX2_ID ${indexnr}800004b0 S +$cf_id $TRX1_ID ${indexnr}800007d0 X +$cf_id $TRX1_ID ${indexnr}800007d1 X +$cf_id $TRX1_ID ${indexnr}800007d2 X +$cf_id $TRX1_ID ${indexnr}800007d3 X +$cf_id $TRX1_ID ${indexnr}800007d4 X +$cf_id $TRX1_ID ${indexnr}800007d5 X +$cf_id $TRX1_ID ${indexnr}800007d6 X +$cf_id $TRX1_ID ${indexnr}800007d7 X +$cf_id $TRX1_ID ${indexnr}800007d8 X +$cf_id $TRX1_ID ${indexnr}800007d9 X +$cf_id $TRX1_ID ${indexnr}800009c4 S +$cf_id $TRX2_ID ${indexnr}800009ce S +connection default; +show status like 'rocksdb_locktree_current_lock_memory'; +Variable_name Value +rocksdb_locktree_current_lock_memory 8792 +set @save_mlm= @@rocksdb_max_lock_memory; +# Set the limit to cause lock escalation: +set @cur_mem_usage= (select +variable_value +from +performance_schema.global_status +where +variable_name='rocksdb_locktree_current_lock_memory'); +set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED); +connection con1; +insert into t1 select 3000+A.a, 0 from t0 A; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X +$cf_id $TRX1_ID ${indexnr}800003e9 S +$cf_id $TRX2_ID ${indexnr}800003e9 S +$cf_id 
$TRX1_ID ${indexnr}8000044c S +$cf_id $TRX2_ID ${indexnr}8000044c S +$cf_id $TRX1_ID ${indexnr}800004b0 S +$cf_id $TRX2_ID ${indexnr}800004b0 S +$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X +$cf_id $TRX1_ID ${indexnr}800009c4 S +$cf_id $TRX2_ID ${indexnr}800009ce S +$cf_id $TRX1_ID ${indexnr}80000bb8 X +$cf_id $TRX1_ID ${indexnr}80000bb9 X +$cf_id $TRX1_ID ${indexnr}80000bba X +$cf_id $TRX1_ID ${indexnr}80000bbb X +$cf_id $TRX1_ID ${indexnr}80000bbc X +$cf_id $TRX1_ID ${indexnr}80000bbd X +$cf_id $TRX1_ID ${indexnr}80000bbe X +$cf_id $TRX1_ID ${indexnr}80000bbf X +$cf_id $TRX1_ID ${indexnr}80000bc0 X +$cf_id $TRX1_ID ${indexnr}80000bc1 X +connection con1; +rollback; +connection default; +rollback; +disconnect con1; +set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED); +drop table t0, t1; diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result index 15286c6a9f5a..ce4195e8d92b 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb.result +++ b/mysql-test/suite/rocksdb/r/rocksdb.result @@ -990,6 +990,7 @@ rocksdb_max_background_jobs 2 rocksdb_max_bottom_pri_background_compactions 0 rocksdb_max_compaction_history 64 rocksdb_max_latest_deadlocks 5 +rocksdb_max_lock_memory 1073741824 rocksdb_max_log_file_size 0 rocksdb_max_manifest_file_size 1073741824 rocksdb_max_manual_compactions 10 @@ -1058,6 +1059,8 @@ rocksdb_use_default_sk_cf OFF rocksdb_use_direct_io_for_flush_and_compaction OFF rocksdb_use_direct_reads OFF rocksdb_use_fsync OFF +rocksdb_use_range_lock_manager_as_point OFF +rocksdb_use_range_locking OFF rocksdb_validate_tables 1 rocksdb_verify_row_debug_checksums OFF rocksdb_wal_bytes_per_sync 0 diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result index 58329d03ebc9..71e6ff4d30b2 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result @@ -72,7 +72,7 @@ update 
t1 set c2=100 where c1=3; delete from t1 where c1 <= 2; include/sync_slave_sql_with_master.inc [connection slave] -select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; +select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; read_free false select * from t1; diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result index 1e253a9974b3..08a0a2f59426 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result @@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF begin work; insert into t1 values (9); insert into t1 values (10); +# Fix for Range Locking: force a snapshot to be taken: +select * from t1 where a=100; +a update t1 set a = a + 1 where a = 2; begin work; insert into t1 values (11); diff --git a/mysql-test/suite/rocksdb/r/select_count_for_update.result b/mysql-test/suite/rocksdb/r/select_count_for_update.result index 1107aa2f6cb0..6672d43eb438 100644 --- a/mysql-test/suite/rocksdb/r/select_count_for_update.result +++ b/mysql-test/suite/rocksdb/r/select_count_for_update.result @@ -35,9 +35,9 @@ SELECT COUNT(*) FROM t1 FORCE INDEX (sk); COUNT(*) 3 SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE; -ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE; -ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on 
index: $FAILING_INDEX connection con1; COMMIT; SELECT COUNT(*) FROM t1 FORCE INDEX (sk); diff --git a/mysql-test/suite/rocksdb/r/trx_info.result b/mysql-test/suite/rocksdb/r/trx_info.result index ada2e127021e..562c1d7022ed 100644 --- a/mysql-test/suite/rocksdb/r/trx_info.result +++ b/mysql-test/suite/rocksdb/r/trx_info.result @@ -9,5 +9,10 @@ a 2 select * from information_schema.rocksdb_trx; TRANSACTION_ID STATE NAME WRITE_COUNT LOCK_COUNT TIMEOUT_SEC WAITING_KEY WAITING_COLUMN_FAMILY_ID IS_REPLICATION SKIP_TRX_API READ_ONLY HAS_DEADLOCK_DETECTION NUM_ONGOING_BULKLOAD THREAD_ID QUERY -_TRX_ID_ STARTED _NAME_ 0 2 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx +_TRX_ID_ STARTED _NAME_ 0 2_or_3 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx +select +if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT +from information_schema.rocksdb_trx; +LOCK_COUNT_IS_CORRECT +1 DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result index 1da78db24b1c..d4ef2e0ff2ee 100644 --- a/mysql-test/suite/rocksdb/r/unique_sec.result +++ b/mysql-test/suite/rocksdb/r/unique_sec.result @@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5' UPDATE t1 SET id5=34 WHERE id1=38; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5 # NULL values are unique +# (Note: the following UPDATE reads through the whole table without +# finding anything to update. 
With point locking, this is fine, +# but with range locking it will time out while waiting on a row lock +# that the other transaction is holding) UPDATE t1 SET id5=NULL WHERE value1 > 37; COMMIT; COMMIT; diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result index d6d06f6ece5a..0e71e6481aa7 100644 --- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result +++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result @@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5' UPDATE t1 SET id5=34 WHERE id1=38; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5 # NULL values are unique +# (Note: the following UPDATE reads through the whole table without +# finding anything to update. With point locking, this is fine, +# but with range locking it will time out while waiting on a row lock +# that the other transaction is holding) UPDATE t1 SET id5=NULL WHERE value1 > 37; COMMIT; COMMIT; diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test index 42e46bb0f28f..55e6502c079b 100644 --- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test +++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test @@ -1,3 +1,9 @@ +# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE; +# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks. 
+# A part of this test that works with range locking is in +# range_locking_deadlock_tracking.test +--source suite/rocksdb/include/not_range_locking.inc + set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; set @prior_deadlock_detect = @@rocksdb_deadlock_detect; set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; @@ -137,7 +143,6 @@ rollback; connection default; --replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ show engine rocksdb transaction status; - echo Deadlock #6; connection con1; create table t1 (id int primary key, value int) engine=rocksdb; diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test index f7eb8151f40d..05ae30f2dddd 100644 --- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test +++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test @@ -3,6 +3,10 @@ --source include/have_rocksdb.inc --source include/count_sessions.inc +# Doesn't work with range locking because range locking +# does not provide info in rocksdb_deadlock. +--source suite/rocksdb/include/not_range_locking.inc + --disable_query_log call mtr.add_suppression("Column family '[a-z_]+' not found"); --enable_query_log diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test new file mode 100644 index 000000000000..55203af9cf8d --- /dev/null +++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test @@ -0,0 +1,15 @@ +--source include/have_rocksdb.inc + +# Range locking uses InnoDB-like transaction isolation, which +# means the results differ from "true" Repeatable Read. +--source suite/rocksdb/include/have_range_locking.inc + + +# Hermitage is an attempt to test transaction isolation levels. 
+# https://github.com/ept/hermitage + +let $trx_isolation = READ COMMITTED; +--source hermitage.inc + +let $trx_isolation = REPEATABLE READ; +--source hermitage.inc diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc index 90f7d4825331..83815a70459a 100644 --- a/mysql-test/suite/rocksdb/t/hermitage.inc +++ b/mysql-test/suite/rocksdb/t/hermitage.inc @@ -108,6 +108,8 @@ select * from test where value % 3 = 0; commit; --source hermitage_init.inc +let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`; +let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`; connection con1; update test set value = value + 10; connection con2; @@ -117,13 +119,13 @@ send delete from test where value = 20; connection con1; commit; connection con2; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { reap; # RC: Returns 2 => 30 select * from test; } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { --error ER_LOCK_DEADLOCK reap; @@ -147,13 +149,13 @@ send update test set value = 12 where id = 1; connection con1; commit; connection con2; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { reap; # RC: Returns 1 => 12 select * from test; } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { --error ER_LOCK_DEADLOCK reap; @@ -200,12 +202,12 @@ update test set value = 12 where id = 1; update test set value = 18 where id = 2; commit; connection con1; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { delete from test where value = 20; # doesn't delete anything select * from test where id = 2; # shows 2 => 18 } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { --error ER_LOCK_DEADLOCK delete from test where value = 20; diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test index 
e4138e8d89fe..51f3f286a0e0 100644 --- a/mysql-test/suite/rocksdb/t/hermitage.test +++ b/mysql-test/suite/rocksdb/t/hermitage.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# See hermitage-range_locking variant +--source suite/rocksdb/include/not_range_locking.inc + # Hermitage is an attempt to test transaction isolation levels. # https://github.com/ept/hermitage diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test index e0479d6a3370..82fa9fc6bbdc 100644 --- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test +++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that +# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test +--source suite/rocksdb/include/not_range_locking.inc + set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; set @prior_deadlock_detect = @@rocksdb_deadlock_detect; set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test index 671ea4708d60..3657e977a704 100644 --- a/mysql-test/suite/rocksdb/t/issue111.test +++ b/mysql-test/suite/rocksdb/t/issue111.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +# The testcase here assumes key tracking is present +# (and range locking uses InnoDB-like approach, "DMLs use Read Committed") +--source suite/rocksdb/include/not_range_locking.inc + connect (con2,localhost,root,,); connection default; diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test new file mode 100644 index 000000000000..465fb9099dac --- /dev/null +++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test @@ -0,0 +1,10 @@ +# +# A range-locking variant of issue243_transactionStatus.test + +--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc + +let $forced_range_locking=1; +--source issue243_transactionStatus.test + + diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test index 1e2f0b41226b..5c1948ebe817 100644 --- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test +++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +if (!$forced_range_locking) { +--source suite/rocksdb/include/not_range_locking.inc +} + --disable_warnings DROP TABLE IF EXISTS t1; --enable_warnings diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test new file mode 100644 index 000000000000..6c42c7be12cf --- /dev/null +++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test @@ -0,0 +1,9 @@ +--source include/have_rocksdb.inc + +# Range locking uses InnoDB-like transaction isolation, which +# means the results differ from "true" Repeatable Read. 
+--source suite/rocksdb/include/have_range_locking.inc + +let $trx_isolation = REPEATABLE READ; +--source transaction_isolation.inc + diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test index cf29073f69ec..b81dcf31ab1b 100644 --- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test +++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# See level_repeatable_read-range_locking variant +--source suite/rocksdb/include/not_range_locking.inc + let $trx_isolation = REPEATABLE READ; --source transaction_isolation.inc diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test index 1b624cf38c05..a277c1b8d8de 100644 --- a/mysql-test/suite/rocksdb/t/lock_info.test +++ b/mysql-test/suite/rocksdb/t/lock_info.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test) +--source suite/rocksdb/include/not_range_locking.inc + --disable_warnings DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test index 035046ae368d..95a6676f78a2 100644 --- a/mysql-test/suite/rocksdb/t/locking_issues.test +++ b/mysql-test/suite/rocksdb/t/locking_issues.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# A lot of tests below assume point locking, not range. 
+--source suite/rocksdb/include/not_range_locking.inc + let $isolation_level = REPEATABLE READ; --source suite/rocksdb/include/locking_issues_case1_1.inc diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test index 4b07f3d84928..d4b2604f1e30 100644 --- a/mysql-test/suite/rocksdb/t/max_row_locks.test +++ b/mysql-test/suite/rocksdb/t/max_row_locks.test @@ -1,4 +1,5 @@ --source include/have_rocksdb.inc +--source suite/rocksdb/include/not_range_locking.inc create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2)); --disable_query_log diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc new file mode 100644 index 000000000000..b3ec0ae7367b --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking.inc @@ -0,0 +1,612 @@ +# +# Range locking tests. +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +--enable_connect_log + + +show variables like 'rocksdb_use_range_locking'; + +eval create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 values +(10,10),(20,20),(30,30); + +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +--echo ### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; + +connection con2; +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (15,15); + +connection con1; +rollback; + +--echo ## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; + +connection con2; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk between 15 and 35 for update; +rollback; + +connection con1; +rollback; + +--echo ## Test: check that regular read does not get a range lock +connection 
con1; +begin; +select * from t1 where pk between 5 and 25; + +connection con2; +begin; +# This must not block +select * from t1 where pk between 15 and 35 for update; +rollback; + +connection con1; +rollback; + +--echo ## Test that locks are not released when a statement inside +--echo ## a transaction is rolled back +eval +create table t2 ( + pk int, + a int, + primary key (pk) comment '$pk_cf', + unique key(a) comment '$sk_cf' +) engine=rocksdb; + +insert into t2 values (1,1),(2,2); + +begin; +insert into t2 values (3,3); +--error ER_DUP_ENTRY +insert into t2 values (10,2); + +connection con2; +begin; +# This must time out: +--error ER_LOCK_WAIT_TIMEOUT +select * from t2 where pk=3 for update; + +rollback; +connection con1; +rollback; +drop table t2; + +# cleanup +connection default; +disconnect con1; +disconnect con2; +drop table t1; + +--echo # +--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode +--echo # + +connect (con1,localhost,root,,); +connection con1; +eval create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; + + +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 values +(10,10),(20,20),(30,30); + +begin; +select * from t1 where pk=10 for update; + +#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +delete from t1 where pk between 25 and 40; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +begin; +--echo # The following will show a range lock on 2-9 and also a point lock on 10. +--echo # This is how things currently work. 
(after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; + +--echo # +--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +eval +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + primary key(kp1, kp2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; + +begin; +select * from t1 where kp1=2 for update; + +connection default; +--echo # The lock on kp1=2 should inhibit the following INSERT: +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values ( 2,5,9999); +rollback; + +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; + +--echo # +--echo # Test that locks on ranges on non-unique secondary keys inhibit +--echo # modifications of the contents of these ranges +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +eval +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + key(kp1, kp2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; +begin; +--replace_column 10 # +explain +select * from t1 where kp1=2 for update; +select * from t1 where kp1=2 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (2, 9, 9999); + +--error ER_LOCK_WAIT_TIMEOUT +delete from t1 where kp1=2 and 
kp2=5; + +# Update that "moves a row away" from the locked range +--error ER_LOCK_WAIT_TIMEOUT +update t1 set kp1=333 where kp1=2 and kp2=3; + +# Update that "moves a row into" the locked range +--error ER_LOCK_WAIT_TIMEOUT +update t1 set kp1=2 where kp1=1 and kp2=8; + +rollback; + +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; + +--echo # +--echo # Transaction isolation test +--echo # + +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; + +--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; + +--echo # Examine the result: +--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +--echo # (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Same test as above, but check the range scan +--echo # + +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; + +--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; + +--echo # Examine the result: +--echo # pk={3,4,5} a=2223 means UPDATE in 
TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Same as above, but test SELECT FOR UPDATE. +--echo # +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; + +--echo # TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +select * from t1 where pk=2 for update; +select * from t1 where pk=2 lock in share mode; +select * from t1 where pk=2; + +commit; + +disconnect con1; +connection default; +drop table t1; + +if (!$PK_USES_REVERSE_CF) { +--echo # +--echo # Another no-snapshot-checking test, this time for single-statement +--echo # transaction +--echo # +eval +create table t1 ( + pk int, + a int, + name varchar(16), + primary key(pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1, 'row1'), (2,2,'row2'); + +connect (con1,localhost,root,,); +connection con1; +select get_lock('row1', 100); + +connection default; + +--echo # The following will read the first row (1,1,'row1'), and stop. 
+ +send update t1 set a=a+100 where get_lock(name, 1000)=1; + +# Wait till the default connection has stopped: +connection con1; + +let $wait_condition= + SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock" + AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1"; +--source include/wait_condition.inc + +# Update the second row +update t1 set a=5 where pk=2; + +select release_lock('row1'); + +connection default; +reap; + +--echo # Look at the row with pk=2: +--echo # 2, 105, row2 - means the UPDATE was reading current data (Correct) +--echo # 2, 102, row - means the UPDATE read the snapshot (incorrect) +select * from t1; + +--echo # Try releasing both locks (in 5.6, we will be holding only the second one) +select release_lock(name) from t1; + +disconnect con1; +connection default; +drop table t1; +} + +--echo # +--echo # Check that I_S.processlist.state is set correctly now. +--echo # +eval +create table t1( + pk int, + a int, + primary key(pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); + +begin; +select * from t1 where pk=2 for update; + +--connect (con1,localhost,root,,) +begin; +set rocksdb_lock_wait_timeout=300; +send select * from t1 where pk=2 for update; + +connection default; +--echo # Now, will wait until we see con1 have state="Waiting for row lock" +let $wait_condition= + SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock" + AND INFO = "select * from t1 where pk=2 for update"; +--source include/wait_condition.inc + +rollback; +connection con1; +--reap +rollback; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Test range locking for ranges with HA_READ_PREFIX_LAST +--echo # +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +eval +create table t1 ( + pk1 int, + pk2 int, + a int, + primary key(pk1, pk2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 
+select + A.a, B.a, A.a*10+B.a +from + t0 A, t0 B; + + +# Get a lock in another connection so that the primary transaction is not using +# STO optimization, and its locks can be seen in I_S.rocksdb_locks +--connect (con1,localhost,root,,) +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); + +connection default; +begin; +--echo # Should use ref access w/o filesort: +--replace_column 10 # +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; + +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +--echo # +--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +--echo # + +begin; +--echo # Should use range access with 2 keyparts and w/o filesort: +--replace_column 10 # +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; + +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +connection con1; +rollback; + +connection default; +drop table t0, t1; + +--echo # +--echo # A bug: range locking was not used when scan started at table start or end +--echo # +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; + +create table t1 ( + pk int not null, + a int, + primary key(pk) +) engine=rocksdb; + +insert into t1 select a*2,a*2 from t10; + +connection con1; +begin; +select * from t1 where pk=500 for update; +connection default; + +begin; +select * from t1 where pk<10 order by pk limit 10 for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc 
+rollback; + +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +connection con1; +rollback; +disconnect con1; + +connection default; +drop table t0,t10,t1; + +--echo # +--echo # Range locking and READ-COMMITTED isolation level +--echo # +connect (con1,localhost,root,,); +connection con1; +set session transaction isolation level read committed; +create table t1 ( + pk int not null, + a int, + primary key(pk) +) engine=rocksdb; + +insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +begin; +select * from t1 where pk between 2 and 5 for update; +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Below should show individual row locks, not locked range: +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +begin; +update t1 set a=a+1 where pk between 2 and 5; +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Below should show individual row locks, not locked range: +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +drop table t1; +disconnect con1; +connection default; + +--echo # +--echo # Range Locking and READ-COMMITTED, another test +--echo # +create table t1 ( + pk int, + a int, + b int, + primary key (pk), + key(a) +) engine=rocksdb; + +insert into t1 values +(1, 100, 1000), +(2, 200, 2000), +(3, 300, 3000); + +set transaction isolation level repeatable read; +begin; +update t1 set b = b + 1 where a > 200; + +connect (con1,localhost,root,,); +connection con1; +set transaction isolation level read committed; +begin; +insert into t1 values (4, 150, 1500); +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (5, 250, 1500); + +rollback; + +disconnect con1; +connection default; +rollback; +drop table t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test new file mode 100644 index 
000000000000..5c599238a0a7 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking.test @@ -0,0 +1,6 @@ + +--let pk_cf=default +--let sk_cf=default + +--source range_locking.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt b/mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt new file mode 100644 index 000000000000..b20b064d5af7 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt @@ -0,0 +1 @@ +--rocksdb_deadlock_detect=1 --rocksdb_lock_wait_timeout=10 diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.py b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py new file mode 100644 index 000000000000..d048f8823834 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py @@ -0,0 +1,520 @@ +""" +This script tests Range Locking. See + mysql-test/suite/rocksdb/t/range_locking_conc_test.txt for details. + +Usage: + + python3 suite/rocksdb/t/range_locking_conc_test.py \ + [--progress] [--verbose] \ + root 127.0.0.1 $MASTER_MYPORT test t1 \ + num_inserts num_insert_threads \ + num_group_ops num_group_threads + + + For Example: + + time python3 suite/rocksdb/t/range_locking_conc_test.py --progress root 127.0.0.1 3314 test t1 2000 20 10000 40 + +""" + +import hashlib +import MySQLdb +from MySQLdb.constants import ER +import os +import random +import signal +import sys +import threading +import time +import string +import traceback + +# MAX_PK_VAL = 1000*1000*1000 +MAX_PK_VAL = 1000 * 1000 + +show_progress = False +verbose_output = False + +counter_n_inserted = 0 +counter_n_deleted = 0 +counter_n_insert_failed = 0 +counter_n_groups_created = 0 +counter_n_group_create_fails = 0 +counter_n_groups_verified = 0 +counter_n_groups_deleted = 0 + + +def is_lock_error(exc): + error_code = exc.args[0] + return error_code == ER.LOCK_WAIT_TIMEOUT or error_code == ER.LOCK_DEADLOCK + + +# +# Watcher prints the test progress and some status variables once per second +# +class 
Watcher(threading.Thread): + Instance = None + + def __init__(self, con): + threading.Thread.__init__(self) + self.should_stop = False + self.finished = False + self.con = con + self.start() + + def run(self): + global counter_n_inserted + global counter_n_deleted + global counter_n_insert_failed + global counter_n_groups_created + global counter_n_group_create_fails + global counter_n_groups_verified + global counter_n_groups_deleted + event = threading.Event() + + save_counter_n_inserted = 0 + save_counter_n_deleted = 0 + save_counter_n_insert_failed = 0 + save_counter_n_groups_created = 0 + save_counter_n_group_create_fails = 0 + save_counter_n_groups_verified = 0 + save_counter_n_groups_deleted = 0 + save_wait_count = 0 + n = 0 + cur = self.con.cursor() + try: + while not self.should_stop: + event.wait(1) + + cur.execute("show status like '%rocksdb_locktree_wait_count%'") + row = cur.fetchone() + wait_count = int(row[1]) + + print("== %d ========================" % n) + print( + "counter_n_inserted=%d" + % (counter_n_inserted - save_counter_n_inserted) + ) + print( + "counter_n_deleted=%d" + % (counter_n_deleted - save_counter_n_deleted) + ) + print( + "counter_n_insert_failed=%d" + % (counter_n_insert_failed - save_counter_n_insert_failed) + ) + print( + "counter_n_groups_created=%d" + % (counter_n_groups_created - save_counter_n_groups_created) + ) + print( + "counter_n_group_create_fails=%d" + % (counter_n_group_create_fails - save_counter_n_group_create_fails) + ) + print( + "counter_n_groups_verified=%d" + % (counter_n_groups_verified - save_counter_n_groups_verified) + ) + print( + "counter_n_groups_deleted=%d" + % (counter_n_groups_deleted - save_counter_n_groups_deleted) + ) + print("wait_count=%d" % (wait_count - save_wait_count)) + + save_counter_n_inserted = counter_n_inserted + save_counter_n_deleted = counter_n_deleted + save_counter_n_insert_failed = counter_n_insert_failed + save_counter_n_groups_created = counter_n_groups_created + 
save_counter_n_group_create_fails = counter_n_group_create_fails + save_counter_n_groups_verified = counter_n_groups_verified + save_counter_n_groups_deleted = counter_n_groups_deleted + save_wait_count = wait_count + n += 1 + + except Exception as e: + self.exception = traceback.format_exc() + print("Watcher caught (%s)" % (e)) + + finally: + self.finish() + + def finish(self): + n = 0 + # Do nothing + + +# +# A worker is one client thread producing the benchmark workload +# +class Worker(threading.Thread): + Instance = None + + def __init__(self, con, table_name, worker_type_arg, num_inserts): + threading.Thread.__init__(self) + self.finished = False + self.num_inserts = num_inserts + con.autocommit(False) + self.con = con + self.rand = random.Random() + self.exception = None + self.table_name = table_name + self.worker_type = worker_type_arg + Worker.Instance = self + self.start() + + def run(self): + self.rand.seed(threading.get_ident()) + my_id = threading.get_ident() + try: + self.cur = self.con.cursor() + if self.worker_type == "insert": + self.run_inserts() + if self.worker_type == "join_group": + self.run_create_groups() + except Exception as e: + print(e) + self.exception = traceback.format_exc() + print("THR %d caught (%s)" % (my_id, e)) + + finally: + self.finish() + + # + # Insert one row, making sure this doesn't break any groups + # + def run_one_insert(self): + global counter_n_inserted + global counter_n_deleted + global counter_n_insert_failed + cur = self.cur + # pk = self.rand.randint(1, MAX_PK_VAL) + # Note: here the distribution is intentionally 2x wider than the grouping + # thread has. 
+ pk = int(self.rand.normalvariate(MAX_PK_VAL / 2, MAX_PK_VAL / 50.0)) + + cur.execute("begin") + do_commit = False + cur.execute( + "select pk,parent_id,group_list from t1 where pk>=%s limit 1 for update", + (pk,), + ) + row = cur.fetchone() + group_to_delete = None + if row is None: + # print("No row found, inserting %d" % (pk+1000*1000)) + cur.execute("insert into t1 (pk) values(%s)", (pk + 1000 * 1000,)) + do_commit = True + else: + if (row[0] - pk) > 2 and row[1] is None: + # print("Row found, inserting into gap, %d" % pk) + cur.execute("insert into t1 (pk) values(%s)", (pk,)) + do_commit = True + else: + # print("Row found, grouped or too tight") + if row[2]: + # if parent_id is present, use it + group_to_delete = row[0] + # print("About to delete %d" % group_to_delete) + do_commit = False + + if do_commit: + cur.execute("commit") + counter_n_inserted += 1 + return 1 + else: + counter_n_insert_failed += 1 + if group_to_delete: + counter_n_deleted += 5 + self.delete_group(group_to_delete, True) + cur.execute("commit") + else: + cur.execute("rollback") + return 0 + + def run_one_create_group(self): + global counter_n_groups_created + global counter_n_group_create_fails + global counter_n_groups_deleted + cur = self.cur + # pk = self.rand.randint(1, MAX_PK_VAL) + pk = int(self.rand.normalvariate(MAX_PK_VAL / 2, MAX_PK_VAL / 100)) + + n_rows = 0 + n_groups_deleted = 0 + first_pk = None + cur.execute( + "select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", + (pk,), + ) + row = cur.fetchone() + while row is not None: + if first_pk is None: + first_pk = row[0] + group_list = str(first_pk) + else: + group_list = group_list + "," + str(row[0]) + + last_pk = row[0] + if row[1] is not None: + # Found a row in a group + # Continue until group end. 
+ found_next_group = False + row = cur.fetchone() + while row is not None: + if row[1] is None: + found_next_group = True + first_pk = row[0] + group_list = str(first_pk) + break + row = cur.fetchone() + + if not found_next_group: + break + + if row[2] is not None: + # Found a group leader row. + ungrouped_ids = self.delete_group(row[0], False) + n_groups_deleted += 1 + i = 1 + n_rows += 1 + while n_rows < 5: + group_list = group_list + "," + str(ungrouped_ids[i]) + last_pk = ungrouped_ids[i] + i += 1 + n_rows += 1 + break + n_rows += 1 + row = cur.fetchone() + + if n_rows == 5 or n_groups_deleted > 0: + # Ok we got 5 rows in a row and they are all standalone + # Create a group. + # print("Creating group %d" % first_pk) + cur.execute( + "update t1 set group_list=%s where pk=%s", + ( + group_list, + first_pk, + ), + ) + cur.execute( + "update t1 set parent_id=%s where pk > %s and pk <=%s", + (first_pk, first_pk, last_pk), + ) + cur.execute("commit") + counter_n_groups_created += 1 + counter_n_groups_deleted += n_groups_deleted + return 1 + else: + # print("Failed to join a group") + counter_n_group_create_fails += 1 + cur.execute("rollback") + return 0 + + # + # Verify and delete the group + # @return An array listing the deleted PKs + # + def delete_group(self, group_id, delete_rows): + global counter_n_groups_verified + cur = self.con.cursor() + cur.execute( + "select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", + (group_id,), + ) + first_pk = None + n_rows = 0 + + row = cur.fetchone() + while row is not None: + if first_pk is None: + first_pk = row[0] + group_list = str(first_pk) + group_arr = [] + group_arr.append(first_pk) + group_list_base = row[2] + if first_pk != group_id: + self.raise_error("First row is not the group leader!") + else: + group_list = group_list + "," + str(row[0]) + group_arr.append(row[0]) + + last_pk = row[0] + if row[0] != first_pk and row[1] != first_pk: + self.raise_error( + "Row in group has wrong parent_id 
(expect %s got %s)" + % (first_pk, row[1]) + ) + break + if row[0] != first_pk and row[2] is not None: + self.raise_error("Row in group is a group leader?") + break + n_rows += 1 + row = cur.fetchone() + + if n_rows != 5: + self.raise_error( + "Expected %d rows got %d" + % ( + 5, + n_rows, + ) + ) + if group_list != group_list_base: + self.raise_error( + "Group contents mismatch: expected '%s' got '%s'" + % (group_list_base, group_list) + ) + # Ok, everything seems to be in order. + if delete_rows: + cur.execute( + "delete from t1 where pk>=%s and pk<=%s", + ( + group_id, + last_pk, + ), + ) + else: + cur.execute( + "update t1 set parent_id=NULL, group_list=NULL where pk>=%s and pk<=%s", + ( + group_id, + last_pk, + ), + ) + + counter_n_groups_verified += 1 + return group_arr + + def raise_error(self, msg): + print("Data corruption detected: " + msg) + sys.exit("Failed!") + + def run_inserts(self): + # print("Worker.run_inserts") + i = 0 + while i < self.num_inserts: + try: + i += self.run_one_insert() + except MySQLdb.OperationalError as e: + self.con.rollback() + cur = self.con.cursor() + if not is_lock_error(e): + raise e + + def run_create_groups(self): + # print("Worker.run_create_groups") + i = 0 + while i < self.num_inserts: + try: + i += self.run_one_create_group() + except MySQLdb.OperationalError as e: + self.con.rollback() + cur = self.con.cursor() + if not is_lock_error(e): + raise e + + def finish(self): + self.finished = True + + +if __name__ == "__main__": + if len(sys.argv) != 10 and len(sys.argv) != 11: + print( + "Usage: range_locking_conc_test.py " + "[--progress] " + "user host port db_name table_name " + "num_inserts num_insert_threads " + "num_grp_ops num_group_threads" + ) + sys.exit(1) + i = 1 + if sys.argv[i] == "--progress": + show_progress = True + i += 1 + + if sys.argv[i] == "--verbose": + verbose_output = True + i += 1 + + user = sys.argv[i] + i += 1 + + host = sys.argv[i] + i += 1 + + port = int(sys.argv[i]) + i += 1 + + db = 
sys.argv[i] + i += 1 + + table_name = sys.argv[i] + i += 1 + + num_inserts = int(sys.argv[i]) + i += 1 + + num_insert_workers = int(sys.argv[i]) + i += 1 + + num_group_ops = int(sys.argv[i]) + i += 1 + + num_group_workers = int(sys.argv[i]) + i += 1 + + con = MySQLdb.connect(user=user, host=host, port=port, db=db) + con.cursor().execute("set global rocksdb_lock_wait_timeout=20") + con.cursor().execute("drop table if exists t1") + con.cursor().execute( + "create table t1 ( " + " pk bigint primary key, " + " group_list varchar(128), " + " parent_id bigint " + ") engine=rocksdb;" + ) + + worker_failed = False + workers = [] + worker_type = "insert" + num_loops = num_inserts + for i in range(num_insert_workers + num_group_workers): + worker = Worker( + MySQLdb.connect(user=user, host=host, port=port, db=db), + table_name, + worker_type, + num_loops, + ) + workers.append(worker) + if i == num_insert_workers - 1: + worker_type = "join_group" + num_loops = num_group_ops + + # A watcher thread to print the statistics periodically + if show_progress: + watcher = Watcher(MySQLdb.connect(user=user, host=host, port=port, db=db)) + + for w in workers: + w.join() + if w.exception: + print("Worker hit an exception:\n%s\n" % w.exception) + worker_failed = True + + # Stop the watcher + if show_progress: + watcher.should_stop = True + watcher.join() + + if verbose_output: + print("\n") + print("rows_inserted: %d" % counter_n_inserted) + print("rows_deleted: %d" % counter_n_deleted) + print("rows_insert_failed: %d" % counter_n_insert_failed) + print("groups_created: %d" % counter_n_groups_created) + print("groups_verified: %d" % counter_n_groups_verified) + print("groups_deleted: %d" % counter_n_groups_deleted) + print("group_create_fails: %d" % counter_n_group_create_fails) + + if worker_failed: + sys.exit(1) diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.test b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test new file mode 100644 index 
000000000000..0b0d4681bdb8 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test @@ -0,0 +1,18 @@ + +# +# Test Range Locking behavior. See +# ./suite/rocksdb/t/range_locking_conc_test.txt for details +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +set @save_rlwt=@@rocksdb_lock_wait_timeout; +let $SERVER_PORT=`select @@port`; +let $exec = /usr/bin/python3 suite/rocksdb/t/range_locking_conc_test.py root 127.0.0.1 $SERVER_PORT test t1 2000 20 30000 10; + +--echo # Run range_locking_conc_test.py +exec $exec; + +set global rocksdb_lock_wait_timeout= @save_rlwt; +DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt new file mode 100644 index 000000000000..f10bfe33925f --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt @@ -0,0 +1,91 @@ + + +A concurrent test for Range Locking code + +== Requirements == + +The idea is to have a concurrent workload, where +- Some clients make modifications to the database that use range locking (let's + denote these RL-UPDATES) + +- There is some concurrent data modification activity, which is likely to + make RL-UPDATES corrupt the data if transaction isolation does not perform + proper range locking + +- There is a way to detect this data corruption. First suggestion: have some + invariants that must remain true regardless of any action done by + RL-UPDATES. We can run the workload, then stop it and verify the invariants + still hold. + +== Basic idea == + +Rows and groups. + +Consider a table: + +create table t1 ( + pk bigint primary key, + group_list varchar(128), + parent_id bigint +) engine=rocksdb; + + +We allow the following to be stored: + +1. Individual rows. An individual row has group_list=NULL, parent_id=NULL. + +2. Groups. + +A group is 5 (potentially could be some other number) of rows with adjacent +PK values. 
+ +The row with the smallest PK value is the "group leader" and has +group_list=(comma-separated-list-of-pks-of-group members). + +The rest of the rows are "followers" and have parent_id=$GROUP_LEADER.pk + +Example of a group: + +mysql> select * from t1 where pk>=720418192 order by pk limit 5; ++-----------+---------------------------------------------------+-----------+ +| pk | group_list | parent_id | ++-----------+---------------------------------------------------+-----------+ +| 720418192 | 720418192,721972360,730798130,741595383,742883456 | NULL | +| 721972360 | NULL | 720418192 | +| 730798130 | NULL | 720418192 | +| 741595383 | NULL | 720418192 | +| 742883456 | NULL | 720418192 | ++-----------+---------------------------------------------------+-----------+ +5 rows in set (0.01 sec) + + +Operations: +- Insert an individual row. It is obvious we may not insert a row into a group. + +- Convert 5 individual rows into a group. One needs range locking to prevent + other threads from deleting these rows or putting them into another group. + +- Disband a group (convert it back into 5 individual rows). + When we are disbanding a group, we verify it to be valid. + +- Delete a group. If we attempt to insert a row and hit a group leader, we + don't insert the row and delete the whole group we've hit, instead. (when + deleting the group, we also verify it to be valid) + This provides some balance between inserts and deletes. + +=== Making sure lock contention happens === + +We use normal distribution (rand.normalvariate) to pick random PK values, +which are then used to make an attempt to insert a row or create a group. + +This creates a much greater contention than the uniform distribution. + +With sufficiently small sigma parameter, the contention seems to be +sufficiently high. + +=== Implementation === + +range_locking_conc_test.py implements the above operations. 
+ + + diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test new file mode 100644 index 000000000000..2a5966b65c3a --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test @@ -0,0 +1,196 @@ +--source suite/rocksdb/include/have_range_locking.inc + +# +# This is deadlock_tracking.test, variant for running with Range Locking: +# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests +# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print +# deadlock information. +# +set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set @prior_deadlock_detect = @@rocksdb_deadlock_detect; +set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; +set global rocksdb_deadlock_detect = on; +set global rocksdb_lock_wait_timeout = 10000; +--echo # Clears deadlock buffer of any prior deadlocks. +set global rocksdb_max_latest_deadlocks = 0; +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +let $engine = rocksdb; + +--source include/count_sessions.inc +connect (con1,localhost,root,,); +let $con1= `SELECT CONNECTION_ID()`; + +connect (con2,localhost,root,,); +let $con2= `SELECT CONNECTION_ID()`; + +connect (con3,localhost,root,,); +let $con3= `SELECT CONNECTION_ID()`; + +connection default; +eval create table t (i int primary key) engine=$engine; +insert into t values (1), (2), (3); +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +echo Deadlock #1; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +echo Deadlock #2; 
+--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 10; + +echo Deadlock #3; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 1; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +connection con3; +set rocksdb_deadlock_detect_depth = 2; + +--echo # Range locking code will report deadlocks, because it doesn't honor +--echo # rocksdb_deadlock_detect_depth: +echo Deadlock #4; +connection con1; +begin; +select * from t where i=1 for update; + +connection con2; +begin; +select * from t where i=2 for update; + +connection con3; +begin; +select * from t where i=3 for update; + +connection con1; +send select * from t where i=2 for update; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc + +send select * from t where i=3 for update; + +connection con3; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con2 and waiting_key != ""; +--source include/wait_condition.inc + +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +--error ER_LOCK_DEADLOCK +select * from t where i=1 for update; +select case when variable_value-@a = 1 then 'true' else 
'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +rollback; + +connection con2; +reap; +rollback; + +connection con1; +reap; +rollback; + +connection default; +set global rocksdb_max_latest_deadlocks = 5; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +--disable_testcase BUG#0000 +echo Deadlock #5; +connection con1; +begin; +select * from t where i=1 for update; + +connection con2; +begin; +select * from t where i=2 for update; + +connection con3; +begin; +select * from t where i=3 lock in share mode; + +connection con1; +select * from t where i=100 for update; +select * from t where i=101 for update; +send select * from t where i=2 for update; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc + +select * from t where i=3 lock in share mode; +select * from t where i=200 for update; +select * from t where i=201 for update; + +--error ER_LOCK_DEADLOCK +select * from t where i=1 lock in share mode; +rollback; + +connection con1; +reap; +rollback; + +connection con3; +rollback; + +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +--enable_testcase +echo Deadlock #6; +connection con1; +create table t1 (id int primary key, value int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5); +begin; +update t1 set value=value+100 where id=1; +update t1 set value=value+100 where id=2; + +connection con2; +begin; +update t1 set value=value+200 where id=3; + +connection con1; +send update t1 set value=value+100 where 
id=3; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc +--error ER_LOCK_DEADLOCK +update t1 set value=value+200 where id=1; + +# con2 tx is automatically rolled back +connection con1; +reap; +select * from t1; +drop table t1; + +connection default; + +disconnect con1; +disconnect con2; +disconnect con3; + +set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout; +set global rocksdb_deadlock_detect = @prior_deadlock_detect; +drop table t; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 0; +--echo # Clears deadlock buffer of any existent deadlocks. +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt new file mode 100644 index 000000000000..d0087e2a77b0 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt @@ -0,0 +1 @@ +--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024 diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test new file mode 100644 index 000000000000..5a6e9fa6616f --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test @@ -0,0 +1,39 @@ +# +# Range Locking - Lock Escalation Tests. 
+# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log + + +show variables like 'rocksdb_use_range_locking'; +show variables like 'rocksdb_max_lock_memory'; +show status like 'rocksdb_locktree_escalation_count'; +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +#begin; +#insert into t1 values (1000111,100011); +#connect (con1,localhost,root,,); +#connection con1; + +insert into t1 +select + A.a + B.a*10 + C.a*100 + D.a*1000, + 12345 +from t0 A, t0 B, t0 C, t0 D; + +select count(*) from t1; + +#connection default; +#disconnect con1; +show status like 'rocksdb_locktree_escalation_count'; + +drop table t0,t1; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_partial_index.test b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test new file mode 100644 index 000000000000..ce49737b2f61 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test @@ -0,0 +1,120 @@ +# +# Range Locking and Partial Indexes +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--source include/have_debug_sync.inc +--enable_connect_log + +create table t0(a int primary key) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk1 int, + pk2 int, + a int not null, + b int, + primary key (pk1, pk2), + key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5' +) engine=rocksdb; + +# pk1=1 IS materialized prefix (N=10) +insert into t1 select + 1, + A.a, + 100 + A.a, + 123456 +from t0 A; +# Run a SELECT so that it is actually materialized: +select * from t1 force index (key1) where pk1=1; + +# pk1=2 IS NOT materialized (N=3) +insert into t1 select + 2, + A.a, + 100 + A.a, + 123456 +from t0 A limit 3; + +# and some more rows +insert into t1 select + 10000 + A.a +10 *B.a +100*C.a, + A.a, + 
100 + A.a, + 123456 +from t0 A, t0 B, t0 C; + +create table t3(pk int primary key); + +connect (con2,localhost,root,,); +connection con2; +begin; +insert into t3 values(3333333); +connection default; + +--echo # +--echo # First, test a query with range lock +--echo # + +--replace_column 10 # +explain +select * from t1 force index (key1) where pk1>=1 and pk1<=10; + +connect (con1,localhost,root,,); +connection con1; +begin; +--echo # Allocate a snapshot +select * from t0 where a=3; + +connection default; +--echo # Make some modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); + +connection con1; +--echo # This doesn't see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10; +--echo # This DOES see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update; + +let $order_by_rowkey=1; +let $SECOND_INDEX_NAME=key1; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +--echo # +--echo # Now, test a query with LockingIterator +--echo # +delete from t1 where b=99999; + +begin; +--echo # Allocate a snapshot +select * from t0 where a=3; + +connection default; +--echo # Make some modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); + +connection con1; +--echo # This doesn't see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15; +--echo # This DOES see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update; + +let $order_by_rowkey=1; +let $SECOND_INDEX_NAME=key1; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +disconnect con1; +connection default; + +disconnect con2; + +drop table t0, t1,t3; diff --git 
a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test new file mode 100644 index 000000000000..9bbb1b9b3927 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test @@ -0,0 +1,70 @@ +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--source include/have_debug_sync.inc + +select @@rocksdb_use_range_locking; + +--disable_warnings +set debug_sync='RESET'; +--enable_warnings +# +# Testcase for iterator snapshot refresh +# +create table ten(a int primary key); +insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table one_k(a int primary key); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C; + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 select a,a from ten; +insert into t1 select a+40, a+40 from ten; +insert into t1 select a+100, a+100 from one_k; +delete from t1 where pk=44; +set global rocksdb_force_flush_memtable_and_lzero_now=1; + +# Ok, now the table has these PK ranges: +# 0..9 40..49 100...1000 +# and all rows have pk=a +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +connection con1; +begin; +set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont'; +send +update t1 set a=a+100 where pk < 3 or pk between 10 and 50; + +# The query is now stuck at the start of the second range. 
+ + +## con2> +connection con2; +set debug_sync='now WAIT_FOR con1_stopped'; + +# Make some changes to check if the iterator is reading current data or +# snapshot +insert into t1 values (44,5000); +delete from t1 where pk= 42; +update t1 set a=5000 where pk between 40 and 45; +set global rocksdb_force_flush_memtable_and_lzero_now=1; + +set debug_sync='now SIGNAL con1_cont'; + +connection con1; +#--error ER_GET_ERRMSG +reap; +select * from t1 where pk<100; + +commit; +disconnect con1; +disconnect con2; +connection default; +set debug_sync='RESET'; + +drop table t1, ten, one_k; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test new file mode 100644 index 000000000000..8b993764235a --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test @@ -0,0 +1,12 @@ +# +# Range locking tests. +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +--let pk_cf=rev:cf1 +--let PK_USES_REVERSE_CF=1 + +--source range_locking.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test new file mode 100644 index 000000000000..542d76661e29 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test @@ -0,0 +1,308 @@ +# +# Range Locking : tests for SeekForUpdate feature +# + +--source include/have_rocksdb.inc +--source include/have_debug_sync.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log +show variables like 'rocksdb_use_range_locking'; + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int, + a int, + primary key (pk) +) engine=rocksdb; + +insert into t1 select + A.a + B.a*10 + C.a*100, + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; + +--echo # Make another connection to get the lock tree out of the STO-mode +connect 
(con1,localhost,root,,); +connection con1; +begin; +select * from t1 where pk=10 for update; + +connection default; +begin; +select * from t1 where pk=11 for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Now, we will just see locks on 10=0xA and 11=0xB: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +--echo # +--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT +--echo # +--replace_column 10 # +explain +select * from t1 where pk>=500 order by pk limit 3 for update; +select * from t1 where pk>=500 order by pk limit 3 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + + +begin; +select * from t1 where pk=11 for update; +explain +select * from t1 order by pk limit 3 for update; +select * from t1 order by pk limit 3 for update; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0, t1; + + +--echo # +--echo # Concurrent tests: let one thread do SeekForUpdate and the other +--echo # interfere by committing modifications +--echo # + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int, + a int, + primary key (pk) +) engine=rocksdb; + +insert into t1 select + A.a + B.a*10 + C.a*100, + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; + +select * from t1 where pk<10; +delete from t1 where pk<10; +select * from t1 where pk<10; + + +--echo # Test what happens when another transaction commits a row +--echo # right before the range we are about to lock (nothing) + +--replace_column 10 # +explain +select * from t1 where pk >=5 order by pk limit 3 for update; + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send select * from t1 where pk >=5 order by pk limit 3 for update; + +connect (con1,localhost,root,,); +connection con1; +set 
debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (3,3); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; +rollback; + +delete from t1 where pk=3; + +--echo # +--echo # Now, repeat the test but let the other transaction insert the row into +--echo # the range we are locking + +--replace_column 10 # +explain +select * from t1 where pk >=5 order by pk limit 1 for update; + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (8,8); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +delete from t1 where pk=8; + +--echo # +--echo # Repeat the third time, this time deleting the row that SeekForUpdate saw +--echo # +insert into t1 values (7,7); + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; + +rollback; + +--echo # +--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT +--echo # error. 
MyRocks code should now be prepared that data reads cause this +--echo # error +--echo # +insert into t1 values (7,7); + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +begin; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +--error ER_LOCK_WAIT_TIMEOUT +reap; + +rollback; + +connection con1; +rollback; +connection default; + +--echo # +--echo # Test the thd_killed check in the iterator +--echo # +let $conn_id=`select connection_id()`; +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +--disable_query_log +eval kill query $conn_id; +--enable_query_log +--echo kill query CONN_ID; +connection default; +--error ER_GET_ERRMSG +reap; +rollback; + +--echo # +--echo # Backward scan test +--echo # +connection con1; +begin; +select * from t1 where pk=500 for update; +connection default; + +insert into t1 values + (1001, 1001), + (1005, 1005), + (1007, 1007), + (1010, 1010); + +begin; +select * from t1 order by pk desc limit 2 for update; + +let $select_from_is_rowlocks_current_trx_only=1; + +--echo # The below will lock from pk=1007 (0x3ef) till the end of the table: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +begin; +select * from t1 where pk <1007 order by pk desc limit 2 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection con1; +rollback; + +connection default; +rollback; + +--echo # +--echo # Backward scan test 2: error condition +--echo # +connection con1; +begin; +select * from t1 where pk=1010 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT 
+select * from t1 order by pk desc limit 2 for update; +rollback; + +connection con1; +rollback; +begin; +select * from t1 where pk=1007 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 order by pk desc limit 2 for update; +rollback; + +connection con1; +rollback; + +disconnect con1; +connection default; +drop table t0,t1; + +--echo # +--echo # A test: full table scan doesn't lock gaps +--echo # + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 values (10,10),(20,20),(30,30); + +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +connection con1; +begin; + +select * from t1 for update; + +connection con2; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (5,5); + +connection con1; +rollback; + +disconnect con1; +disconnect con2; +connection default; +drop table t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc new file mode 100644 index 000000000000..6ab2f31cae58 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc @@ -0,0 +1,55 @@ + + +--source include/have_rocksdb.inc +--source include/have_debug_sync.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log +show variables like 'rocksdb_use_range_locking'; + + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +eval create table t1 ( + pk int, + a int, + primary key (pk) comment '$cf' +) engine=rocksdb; + +insert into t1 (pk) +select + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; +delete from t1 where pk<100; + +connect (con1,localhost,root,,); +connection con1; + +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 5 for update; + +connection default; +set debug_sync='now WAIT_FOR about_to_lock_range'; 
+insert into t1 (pk) values +(10),(20),(30),(40),(50); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection con1; +reap; +--echo # This must return 1, no 5: +select lock_count from information_schema.rocksdb_trx +where thread_id=CONNECTION_ID(); + +rollback; +disconnect con1; +connection default; +drop table t0, t1; + +--source range_locking_seek_for_update_iter_end.inc +set global rocksdb_enable_iterate_bounds=off; +--source range_locking_seek_for_update_iter_end.inc +set global rocksdb_enable_iterate_bounds=on; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test new file mode 100644 index 000000000000..703331cab9a2 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test @@ -0,0 +1,4 @@ + +--let cf=rlsfu_test +--source range_locking_seek_for_update2.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test new file mode 100644 index 000000000000..0620593b4e78 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test @@ -0,0 +1,4 @@ + +--let cf=rev:rlsfu_test + +--source range_locking_seek_for_update2.inc diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc new file mode 100644 index 000000000000..3b0fb6c53b37 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc @@ -0,0 +1,41 @@ +--echo # +--echo # A testcase for locking at the end of the scan +--echo # +eval create table t1 ( + pk int, + primary key (pk) comment '$cf' +) engine=rocksdb; + +connect (con1,localhost,root,,); +connection con1; + +insert into t1 values (1), (10), (100); + +begin; +select * from t1 for update; + +connection default; +select * from t1; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values 
(150); + +connection con1; +rollback; + +begin; +--replace_column 10 # +explain +select * from t1 order by pk desc for update; +select * from t1 order by pk desc for update; + +connection default; +select * from t1; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (0); + +disconnect con1; +connection default; +drop table t1; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test new file mode 100644 index 000000000000..c6e4e457897b --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test @@ -0,0 +1,202 @@ +# +# Test for shared lock support for range locking +# +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log + +select @@rocksdb_use_range_locking; + +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + + +insert into t1 select a,a from t0; + +--echo # A basic test for shared locks + +begin; +select * from t1 where pk=3 for update; +select * from t1 where pk=5 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connect (con1,localhost,root,,); +connection con1; +begin; +select * from t1 where pk=5 lock in share mode; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; +--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +--echo # Now, TRX2_ID should be gone: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; + +--echo # Get a read lock on pk=3 (where we have a write lock). 
+--echo # The result should be that we will still have a write lock +select * from t1 where pk=3 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +--echo # Get a write lock on pk=5 (where we have a read lock). +--echo # The result should be that we will have a write lock. +select * from t1 where pk=5 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; +rollback; + +--echo # +--echo # Test if a read lock inhibits write locks +--echo # + +begin; +select * from t1 where pk=2 lock in share mode; +select * from t1 where pk=8 for update; + +connection con1; +begin; + +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk=2 for update; + +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk between 0 and 4 for update; + +--error ER_LOCK_WAIT_TIMEOUT +delete from t1 where pk=2; + +--echo # Get a shared lock +select * from t1 where pk=2 lock in share mode; + +--echo # But this should still prevent us from acquiring a write lock on that value: +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk=2 for update; + +rollback; +connection default; +rollback; + +drop table t1; +create table t1 ( + pk int not null primary key, + a int not null, + key(a) +) engine=rocksdb; + +insert into t1 +select + A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a +from + t0 A, t0 B, t0 C, t0 D; +set global rocksdb_force_flush_memtable_now=1; + +connection con1; +begin; +select * from t1 where pk=900 for update; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connection default; +begin; +--replace_column 10 # +explain +select * from t1 where a between 2 and 5 lock in share mode; +select * from t1 where a between 2 and 5 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +disconnect con1; + +drop table 
t0,t1; + +--echo # +--echo # Test shared point locks and lock escalation +--echo # +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 +select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C; + +show status like 'rocksdb_locktree_current_lock_memory'; + +connect (con1,localhost,root,,); +connection con1; + +begin; +--echo # CON1: get some shared locks +select * from t1 where pk=1001 lock in share mode; +select * from t1 where pk=1100 lock in share mode; +select * from t1 where pk=1200 lock in share mode; + +select * from t1 where pk=2500 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connection default; +begin; +--echo # DEFAULT: get the same locks so we have locks with multiple owners +select * from t1 where pk=1001 lock in share mode; +select * from t1 where pk=1100 lock in share mode; +select * from t1 where pk=1200 lock in share mode; + +--echo # DEFAULT: get shared locks with one owner: +select * from t1 where pk=2510 lock in share mode; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + + +--echo # DEFAULT: exclusive locks on 0-10: +insert into t1 select A.a, 0 from t0 A; + +connection con1; +--echo # CON1: exclusive locks on 2000-2010: +insert into t1 select 2000+A.a, 0 from t0 A; + +let $order_by_rowkey=1; +#select * from information_schema.rocksdb_locks; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; +show status like 'rocksdb_locktree_current_lock_memory'; +set @save_mlm= @@rocksdb_max_lock_memory; + +--echo # Set the limit to cause lock escalation: +set @cur_mem_usage= (select + variable_value + from + performance_schema.global_status + where + variable_name='rocksdb_locktree_current_lock_memory'); + +set global rocksdb_max_lock_memory = 
cast(@cur_mem_usage+4 as SIGNED); + +connection con1; +insert into t1 select 3000+A.a, 0 from t0 A; + +#select * from information_schema.rocksdb_locks; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection con1; +rollback; +connection default; +rollback; + +disconnect con1; +set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED); + +drop table t0, t1; + + diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test index d41fe8900623..52e33a69c155 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb.test +++ b/mysql-test/suite/rocksdb/t/rocksdb.test @@ -3,6 +3,9 @@ --source include/have_compact_range_for_drop_table.inc --source include/count_sessions.inc +# Does SHOW WARNINGS and SHOW STATUS which change in Range Locking mode +--source suite/rocksdb/include/not_range_locking.inc + # # RocksDB Storage Engine tests # diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test index 47818bfdbe13..3aa51b7be80f 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test @@ -27,6 +27,10 @@ # In all cases, RR gets snapshot conflict errors if non-first rows get # deleted by another transaction after scanning. +# The tests do not work with range locking as it locks it is about to +# read, first. +--source suite/rocksdb/include/not_range_locking.inc + --source include/have_rocksdb.inc --source include/have_debug_sync.inc diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test index ff0927737371..8b3975723dfe 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test @@ -5,6 +5,9 @@ # --source include/have_debug.inc +# Range locking requests locks before doing snapshot checking. 
+--source suite/rocksdb/include/not_range_locking.inc + --enable_connect_log create table t1 (pk int not null primary key) engine=rocksdb; diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test index 6848459c4454..2affb66f7afd 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test @@ -62,7 +62,7 @@ update t1 set c2=100 where c1=3; delete from t1 where c1 <= 2; --source include/sync_slave_sql_with_master.inc --source include/rpl_connection_slave.inc -select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; +select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; select * from t1; --echo diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test index 694594efd70f..1273a2b6f70a 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test @@ -46,6 +46,8 @@ begin work; insert into t1 values (9); insert into t1 values (10); +--echo # Fix for Range Locking: force a snapshot to be taken: +select * from t1 where a=100; update t1 set a = a + 1 where a = 2; connection con1; diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc index 5a78979f0487..63b72ce5c5ae 100644 --- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc +++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc @@ -3,6 +3,8 @@ --source include/have_debug.inc --source include/have_debug_sync.inc +--source suite/rocksdb/include/not_range_locking.inc + connection master; --disable_warnings drop table if exists 
t1; diff --git a/mysql-test/suite/rocksdb/t/select_count_for_update.test b/mysql-test/suite/rocksdb/t/select_count_for_update.test index 2c6f5d474a1f..aa7059dfc7e6 100644 --- a/mysql-test/suite/rocksdb/t/select_count_for_update.test +++ b/mysql-test/suite/rocksdb/t/select_count_for_update.test @@ -52,9 +52,23 @@ SET lock_wait_timeout = 1; SELECT COUNT(*) FROM t1 FORCE INDEX (sk); # ... but not with LOCK IN SHARE MODE / FOR UPDATE +let $uses_range_locking=`select @@rocksdb_use_range_locking`; + +if ($uses_range_locking == "0") { +--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/ +} +if ($uses_range_locking == "1") { +--replace_regex /test.t1.sk/$FAILING_INDEX/ +} --error ER_LOCK_WAIT_TIMEOUT SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE; +if ($uses_range_locking == "0") { +--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/ +} +if ($uses_range_locking == "1") { +--replace_regex /test.t1.sk/$FAILING_INDEX/ +} --error ER_LOCK_WAIT_TIMEOUT SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE; diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test index 23ce6d452344..cf9d53ff88ae 100644 --- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test +++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# Range locking only supports exclusive locks currently. +--source suite/rocksdb/include/not_range_locking.inc + # # SELECT .. LOCK IN SHARE MODE # diff --git a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test index bfa36714816b..3b8bcb033c07 100644 --- a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test +++ b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test @@ -2,5 +2,8 @@ # wl#8919 Implement NOWAIT and SKIP LOCKED # +# Range locking cannot support SKIP LOCKED? 
(TODO: but can support NOWAIT) +--source suite/rocksdb/include/not_range_locking.inc + --let $engine=ROCKSDB --source include/skip_locked_nowait.inc diff --git a/mysql-test/suite/rocksdb/t/trx_info.test b/mysql-test/suite/rocksdb/t/trx_info.test index 975bed6132c8..009c0ce67d49 100644 --- a/mysql-test/suite/rocksdb/t/trx_info.test +++ b/mysql-test/suite/rocksdb/t/trx_info.test @@ -11,7 +11,11 @@ insert into t1 values (2); set autocommit=0; select * from t1 for update; ---replace_column 1 _TRX_ID_ 3 _NAME_ 7 _KEY_ 14 _THREAD_ID_ +--replace_column 1 _TRX_ID_ 3 _NAME_ 5 2_or_3 7 _KEY_ 14 _THREAD_ID_ select * from information_schema.rocksdb_trx; +select + if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT +from information_schema.rocksdb_trx; + DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test index 47ca74d0e5eb..9814d89448d8 100644 --- a/mysql-test/suite/rocksdb/t/unique_check.test +++ b/mysql-test/suite/rocksdb/t/unique_check.test @@ -2,6 +2,11 @@ --source include/have_debug_sync.inc --source include/count_sessions.inc +# Doesn't work with range locking because lock tree waits do not set +# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for +# details. +--source suite/rocksdb/include/not_range_locking.inc + # For GitHub issue#167 -- Unique key check doesn't work connect (con1, localhost, root,,); diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc index ce0bb1e39a90..508816e6ace1 100644 --- a/mysql-test/suite/rocksdb/t/unique_sec.inc +++ b/mysql-test/suite/rocksdb/t/unique_sec.inc @@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38; UPDATE t1 SET id5=34 WHERE id1=38; --echo # NULL values are unique +--echo # (Note: the following UPDATE reads through the whole table without +--echo # finding anything to update. 
With point locking, this is fine, +--echo # but with range locking it will time out while waiting on a row lock +--echo # that the other transaction is holding) +if (`select @@rocksdb_use_range_locking=0`) { UPDATE t1 SET id5=NULL WHERE value1 > 37; - +} +if (`select @@rocksdb_use_range_locking=1`) { +-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37; +} connection con1; COMMIT; diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test index fbebfeac85a7..0d8a35a1321c 100644 --- a/mysql-test/suite/rocksdb/t/varbinary_format.test +++ b/mysql-test/suite/rocksdb/t/varbinary_format.test @@ -1,6 +1,10 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +# The test uses SELECT .. FOR UPDATE and examines which locks it acquires +# Range Locking will use different locks from point locking +--source suite/rocksdb/include/not_range_locking.inc + # Create a table with a varbinary key with the current format and validate # that it sorts correctly CREATE TABLE t1( diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test index 3ea1a1a60b31..985b2c0c8e7d 100644 --- a/mysql-test/suite/rocksdb/t/varchar_format.test +++ b/mysql-test/suite/rocksdb/t/varchar_format.test @@ -1,6 +1,8 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +--source suite/rocksdb/include/not_range_locking.inc + #################### # Create a table with a varchar key with the current format and validate # that it sorts correctly diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result new file mode 100644 index 000000000000..614737fcfbc6 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING; +SELECT @start_global_value; +@start_global_value +0 +"Trying 
to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly." +SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444; +ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result new file mode 100644 index 000000000000..b2c00bccf799 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT to 444. It should fail because it is readonly." +SET @@global.ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT = 444; +ERROR HY000: Variable 'rocksdb_use_range_lock_manager_as_point' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result new file mode 100644 index 000000000000..614737fcfbc6 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly." 
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444; +ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test new file mode 100644 index 000000000000..ee185aba6600 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCKING +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test new file mode 100644 index 000000000000..ee6eda473fd4 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test new file mode 100644 index 000000000000..ee185aba6600 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCKING +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index cff37465aa68..dc22125c8967 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -137,6 +137,7 @@ SET(ROCKSDB_SOURCES logger.h rdb_datadic.cc rdb_datadic.h rdb_iterator.cc rdb_iterator.h + rdb_locking_iter.cc rdb_locking_iter.h rdb_cf_options.cc rdb_cf_options.h 
rdb_cf_manager.cc rdb_cf_manager.h rdb_converter.cc rdb_converter.h diff --git a/storage/rocksdb/get_rocksdb_files.sh b/storage/rocksdb/get_rocksdb_files.sh index c9d3aa14b364..732a39442f7e 100755 --- a/storage/rocksdb/get_rocksdb_files.sh +++ b/storage/rocksdb/get_rocksdb_files.sh @@ -4,7 +4,7 @@ MKFILE=`mktemp` # include rocksdb make file relative to the path of this script echo "include rocksdb/src.mk all: - @echo \$(LIB_SOURCES)" > $MKFILE + @echo \$(LIB_SOURCES) \$(RANGE_TREE_SOURCES)" > $MKFILE for f in `make --makefile $MKFILE` do echo ../../rocksdb/$f diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index b20d62bc05f6..f07e2078a90d 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -104,6 +104,8 @@ #include "./ObjectFactory.h" #endif +#include "./rdb_locking_iter.h" + // Internal MySQL APIs not exposed in any header. extern "C" { /** @@ -667,6 +669,9 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var, static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var, void *var_ptr, const void *save); +static void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR *var, + void *var_ptr, const void *save); + static void rdb_set_collation_exception_list(const char *exception_list); static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var, void *var_ptr, @@ -823,6 +828,16 @@ static bool rocksdb_disable_instant_ddl = false; bool rocksdb_enable_tmp_table = false; bool rocksdb_enable_delete_range_for_drop_index = false; +// Range Locking: how much memory can be used for the lock data structure +// (which holds the locks acquired by all clients). 
+static ulonglong rocksdb_max_lock_memory; + +bool rocksdb_use_range_locking = 0; +static bool rocksdb_use_range_lock_manager_as_point = 0; +std::shared_ptr range_lock_mgr; + +std::shared_ptr range_lock_mgr_used_as_point; + static std::atomic rocksdb_row_lock_deadlocks(0); static std::atomic rocksdb_row_lock_wait_timeouts(0); static std::atomic rocksdb_snapshot_conflict_errors(0); @@ -1524,6 +1539,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks, nullptr, rocksdb_set_max_latest_deadlocks, rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0); +static MYSQL_SYSVAR_ULONGLONG( + max_lock_memory, rocksdb_max_lock_memory, PLUGIN_VAR_RQCMDARG, + "Range-locking mode: Maximum amount of memory " + "that locks from all transactions can use at a time", + nullptr, rocksdb_set_max_lock_memory, + /*initial*/ 1073741824, 0, UINT64_MAX, 0); + static MYSQL_SYSVAR_ENUM( info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG, "Filter level for info logs to be written mysqld error log. " @@ -2365,6 +2387,17 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan, rocksdb_update_table_stats_use_table_scan, rocksdb_table_stats_use_table_scan); +static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Use Range Locking", nullptr, nullptr, + rocksdb_use_range_locking); + +static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point, + rocksdb_use_range_lock_manager_as_point, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Use Range Lock Manager as point", nullptr, nullptr, + rocksdb_use_range_lock_manager_as_point); + static MYSQL_SYSVAR_BOOL( large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, "Support large index prefix length of 3072 bytes. 
If off, the maximum " @@ -2706,7 +2739,9 @@ static struct SYS_VAR *rocksdb_system_variables[] = { MYSQL_SYSVAR(manual_compaction_threads), MYSQL_SYSVAR(manual_compaction_bottommost_level), MYSQL_SYSVAR(rollback_on_timeout), - + MYSQL_SYSVAR(use_range_locking), + MYSQL_SYSVAR(use_range_lock_manager_as_point), + MYSQL_SYSVAR(max_lock_memory), MYSQL_SYSVAR(enable_insert_with_update_caching), MYSQL_SYSVAR(trace_block_cache_access), MYSQL_SYSVAR(trace_queries), @@ -2935,6 +2970,7 @@ class Rdb_transaction { ulonglong m_update_count = 0; ulonglong m_delete_count = 0; // per row data + // (with range locking, each locked range is counted here, too) ulonglong m_row_lock_count = 0; std::unordered_map m_auto_incr_map; @@ -3008,8 +3044,50 @@ class Rdb_transaction { virtual rocksdb::Status do_pop_savepoint() = 0; virtual void do_rollback_to_savepoint() = 0; + private: + /* + If true, the current statement should not use a snapshot for reading. + Note that in a multi-statement transaction, the snapshot may have been + allocated by another statement. + */ + bool m_stmt_ignores_snapshot = false; + + /* Snapshot-ignore mode will put away m_reads_opts.snapshot here: */ + const rocksdb::Snapshot *m_saved_snapshot = nullptr; + public: + void start_ignore_snapshot() { + // note: this may be called several times for the same statement + if (!m_stmt_ignores_snapshot) { + m_saved_snapshot = m_read_opts.snapshot; + m_read_opts.snapshot = nullptr; + m_stmt_ignores_snapshot = true; + + // For repeatable-read AUTOCOMMIT statements on systems using range + // locking and ttl, there is a possibility no snapshot and + // m_snapshot_time assigned. Compaction relies on the oldest snapshot + // time found in rocksdb to determine what is visibile to transactions, + // so it's possible for compaction to remove rows that were already read + // by an autocomit statement (e.g. insert into .. select union select) + // since no snapshot was created to hold them. 
+ // + // There will be further fixes for TTL which should resolve this case. + if (!m_snapshot_timestamp) + rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); + } + } + + void end_ignore_snapshot_if_needed() { + if (m_stmt_ignores_snapshot) { + m_stmt_ignores_snapshot = false; + m_read_opts.snapshot = m_saved_snapshot; + m_saved_snapshot = nullptr; + } + } + bool in_snapshot_ignore_mode() const { return m_stmt_ignores_snapshot; } + rocksdb::ReadOptions m_read_opts; + const char *m_mysql_log_file_name; my_off_t m_mysql_log_offset; const char *m_mysql_gtid; @@ -3190,6 +3268,18 @@ class Rdb_transaction { virtual void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey, bool force = false) = 0; + virtual rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Endpoint &start, + const rocksdb::Endpoint &end) = 0; + + rocksdb::Status lock_singlepoint_range(rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Slice &point) { + // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs. + // But here we are locking just one point so this is not necessary. + rocksdb::Endpoint endp(point, false); + return lock_range(cf, endp, endp); + } + virtual bool prepare() = 0; bool commit_or_rollback() { @@ -3248,10 +3338,17 @@ class Rdb_transaction { m_is_delayed_snapshot = false; } + void locking_iter_created() { + if (!m_snapshot_timestamp) + rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); + } + virtual void acquire_snapshot(bool acquire_now) = 0; virtual void release_snapshot() = 0; - bool has_snapshot() const { return m_read_opts.snapshot != nullptr; } + bool has_snapshot() const { + return m_read_opts.snapshot != nullptr || m_saved_snapshot; + } private: // The Rdb_sst_info structures we are currently loading. 
In a partitioned @@ -3901,7 +3998,9 @@ class Rdb_transaction { virtual rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *column_family) = 0; + rocksdb::ColumnFamilyHandle *column_family, + const std::shared_ptr &, + bool use_locking_iterator = false) = 0; virtual void multi_get(rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, @@ -3910,10 +4009,11 @@ class Rdb_transaction { const bool sorted_input) const = 0; rocksdb::Iterator *get_iterator( - rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter, + rocksdb::ColumnFamilyHandle *const column_family, + const std::shared_ptr &kd, bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false, - bool create_snapshot = true) { + bool create_snapshot = true, bool use_locking_iterator = false) { // Make sure we are not doing both read_current (which implies we don't // want a snapshot) and create_snapshot which makes sure we create // a snapshot @@ -3943,12 +4043,13 @@ class Rdb_transaction { if (read_current) { options.snapshot = nullptr; } - return get_iterator(options, column_family); + return get_iterator(options, column_family, kd, use_locking_iterator); } virtual bool is_tx_started() const = 0; virtual void start_tx() = 0; - virtual void start_stmt() = 0; + virtual void start_stmt(bool is_dml_statement) = 0; + virtual void start_autocommit_stmt(bool /*is_dml_statement*/) {} virtual void set_name() = 0; protected: @@ -4146,6 +4247,14 @@ class Rdb_transaction_impl : public Rdb_transaction { virtual bool is_writebatch_trx() const override { return false; } + // Lock the range between two specified endpoints + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Endpoint &start_endp, + const rocksdb::Endpoint &end_endp) override { + ++m_row_lock_count; + return m_rocksdb_tx->GetRangeLock(cf, 
start_endp, end_endp); + } + private: void release_tx(void) { // We are done with the current active transaction object. Preserve it @@ -4252,7 +4361,7 @@ class Rdb_transaction_impl : public Rdb_transaction { } void acquire_snapshot(bool acquire_now) override { - if (m_read_opts.snapshot == nullptr) { + if (m_read_opts.snapshot == nullptr && !in_snapshot_ignore_mode()) { const auto thd_ss = std::static_pointer_cast( m_thd->get_explicit_snapshot()); if (thd_ss) { @@ -4422,9 +4531,16 @@ class Rdb_transaction_impl : public Rdb_transaction { rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *const column_family) override { + rocksdb::ColumnFamilyHandle *const column_family, + const std::shared_ptr &kd, + bool use_locking_iterator) override { global_stats.queries[QUERIES_RANGE].inc(); - return m_rocksdb_tx->GetIterator(options, column_family); + if (use_locking_iterator) { + locking_iter_created(); + return GetLockingIterator(m_rocksdb_tx, options, column_family, kd, + &m_row_lock_count); + } else + return m_rocksdb_tx->GetIterator(options, column_family); } const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; } @@ -4498,17 +4614,34 @@ class Rdb_transaction_impl : public Rdb_transaction { /* Start a statement inside a multi-statement transaction. - @todo: are we sure this is called once (and not several times) per - statement start? + @note: If a statement uses N tables, this function will be called N times, + for each TABLE object that is used. For hooking to start of statement that is its own transaction, see ha_rocksdb::external_lock(). */ - void start_stmt() override { + void start_stmt(bool is_dml_statement) override { + if (rocksdb_use_range_locking && is_dml_statement) { + /* + In Range Locking mode, RocksDB does not do "key tracking". + Use InnoDB-like concurrency mode: make the DML statements always read + the latest data (instead of using transaction's snapshot). 
+ This "downgrades" the transaction isolation to READ-COMMITTED on the + primary, but in return the actions can be replayed on the replica. + */ + start_ignore_snapshot(); + } + // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation) acquire_snapshot(false); } + void start_autocommit_stmt(bool is_dml_statement) override { + if (rocksdb_use_range_locking && is_dml_statement) { + start_ignore_snapshot(); + } + } + /* This must be called when last statement is rolled back, but the transaction continues @@ -4637,6 +4770,12 @@ class Rdb_writebatch_impl : public Rdb_transaction { // Nothing to do here since we don't hold any row locks. } + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const, + const rocksdb::Endpoint &, + const rocksdb::Endpoint &) override { + return rocksdb::Status::OK(); + } + void rollback() override { on_rollback(); m_write_count = 0; @@ -4736,7 +4875,9 @@ class Rdb_writebatch_impl : public Rdb_transaction { rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *const /* column_family */) override { + rocksdb::ColumnFamilyHandle *const /* column_family */, + const std::shared_ptr &, + bool /*use_locking_iterator*/) override { const auto it = rdb->NewIterator(options); return m_batch->NewIteratorWithBase(it); } @@ -4754,10 +4895,9 @@ class Rdb_writebatch_impl : public Rdb_transaction { set_initial_savepoint(); } + void start_stmt(bool /*is_dml_statement*/) override {} void set_name() override {} - void start_stmt() override {} - void rollback_stmt() override { if (m_batch) rollback_to_stmt_savepoint(); } @@ -5080,6 +5220,7 @@ static int rocksdb_prepare(handlerton *const hton MY_ATTRIBUTE((__unused__)), DEBUG_SYNC(thd, "rocksdb.prepared"); } else { tx->make_stmt_savepoint_permanent(); + tx->end_ignore_snapshot_if_needed(); } return HA_EXIT_SUCCESS; @@ -5299,6 +5440,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)), - For a COMMIT statement that 
finishes a multi-statement transaction - For a statement that has its own transaction */ + tx->end_ignore_snapshot_if_needed(); if (tx->commit()) { DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED); } @@ -5308,6 +5450,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)), */ tx->set_tx_failed(false); tx->make_stmt_savepoint_permanent(); + tx->end_ignore_snapshot_if_needed(); } if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { @@ -5345,6 +5488,7 @@ static int rocksdb_rollback(handlerton *const hton MY_ATTRIBUTE((__unused__)), - a statement inside a transaction is rolled back */ + tx->end_ignore_snapshot_if_needed(); tx->rollback_stmt(); tx->set_tx_failed(true); } @@ -5449,8 +5593,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { "=========================================\n"; } + template static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info( - const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) { + const PathStruct &txn, const GL_INDEX_ID &gl_index_id) { Rdb_deadlock_info::Rdb_dl_trx_info txn_data; txn_data.trx_id = txn.m_txn_id; @@ -5475,23 +5620,48 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { ? cfh->GetName() : "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id); - txn_data.waiting_key = - rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length()); + txn_data.waiting_key = format_wait_key(txn); txn_data.exclusive_lock = txn.m_exclusive; return txn_data; } - static Rdb_deadlock_info get_dl_path_trx_info( - const rocksdb::DeadlockPath &path_entry) { + // Get the key to use to find the index number (and then, index name) + // Two functions with matching signatures so get_dl_path_trx_info() template + // can be used with both point and range locking. 
+ static const std::string &get_key_for_indexnr( + const rocksdb::DeadlockInfo &info) { + return info.m_waiting_key; + } + static const std::string &get_key_for_indexnr( + const rocksdb::RangeDeadlockInfo &info) { + // Range locks do not span across indexes, so take the left bound + return info.m_start.slice; + } + + // Print the locked key (or range) in hex + // Two functions with matching signatures so get_dl_path_trx_info() template + // can be used with both point and range locking. + static std::string format_wait_key(const rocksdb::DeadlockInfo &info) { + return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length()); + } + static std::string format_wait_key(const rocksdb::RangeDeadlockInfo &info) { + return rdb_hexdump_range(info.m_start, info.m_end); + } + + // Get deadlock path info. A templated function so one can use it with both + // point and range locking. + template + static Rdb_deadlock_info get_dl_path_trx_info(const PathStruct &path_entry) { Rdb_deadlock_info deadlock_info; for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) { const auto &txn = *it; + auto waiting_key = get_key_for_indexnr(txn); const GL_INDEX_ID gl_index_id = { txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast( - txn.m_waiting_key.c_str()))}; + waiting_key.c_str()))}; deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id)); } DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty()); @@ -5516,7 +5686,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { /* Calculate the duration the snapshot has existed */ int64_t snapshot_timestamp = tx->m_snapshot_timestamp; - if (snapshot_timestamp != 0) { + if (snapshot_timestamp != 0 && tx->has_snapshot()) { int64_t curr_time; rdb->GetEnv()->GetCurrentTime(&curr_time); @@ -5535,8 +5705,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { } } - void populate_deadlock_buffer() { - auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + template + void 
populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) { m_data += "----------LATEST DETECTED DEADLOCKS----------\n"; for (const auto &path_entry : dlock_buffer) { @@ -5576,12 +5746,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { } } + void populate_deadlock_buffer() { + if (range_lock_mgr) { + auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer(); + populate_deadlock_buffer_tmpl(dlock_buffer); + } else { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + populate_deadlock_buffer_tmpl(dlock_buffer); + } + } + std::vector get_deadlock_info() { std::vector deadlock_info; - auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); - for (const auto &path_entry : dlock_buffer) { - if (!path_entry.limit_exceeded) { - deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + + if (range_lock_mgr) { + auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer(); + for (const auto &path_entry : dlock_buffer) { + if (!path_entry.limit_exceeded) { + deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + } + } + } else { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + for (const auto &path_entry : dlock_buffer) { + if (!path_entry.limit_exceeded) { + deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + } } } return deadlock_info; @@ -5997,9 +6187,12 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */, return ret_val; } +/* + @param is_dml_statement If true, we are is a DML statement +*/ static inline void rocksdb_register_tx( handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd, - Rdb_transaction *const tx) { + Rdb_transaction *const tx, bool is_dml_stmt) { DBUG_ASSERT(tx != nullptr); trans_register_ha(thd, false, rocksdb_hton, NULL); @@ -6014,8 +6207,10 @@ static inline void rocksdb_register_tx( } } if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { - tx->start_stmt(); + tx->start_stmt(is_dml_stmt); trans_register_ha(thd, true, rocksdb_hton, NULL); + } else { + 
tx->start_autocommit_stmt(is_dml_stmt); } } @@ -6100,7 +6295,7 @@ static int rocksdb_start_tx_and_assign_read_view( DBUG_ASSERT(!tx->has_snapshot()); tx->set_tx_read_only(true); - rocksdb_register_tx(hton, thd, tx); + rocksdb_register_tx(hton, thd, tx, false); tx->acquire_snapshot(true); return HA_EXIT_SUCCESS; @@ -6157,7 +6352,7 @@ static int rocksdb_start_tx_with_shared_read_view( DBUG_ASSERT(!tx->has_snapshot()); tx->set_tx_read_only(true); - rocksdb_register_tx(hton, thd, tx); + rocksdb_register_tx(hton, thd, tx, false); tx->acquire_snapshot(true); // case: an explicit snapshot was not assigned to this transaction @@ -6364,6 +6559,24 @@ static void rocksdb_truncation_table_cleanup(void) { } } +/* + Range locking escalation barrier function + The bytes used for the index number are compared. Endpoints + that span indexes are not merged together. +*/ +static bool rocksdb_escalation_barrier(const rocksdb::Endpoint &a, + const rocksdb::Endpoint &b) { + DBUG_ASSERT(a.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE); + DBUG_ASSERT(b.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE); + if (a.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE && + b.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE && + memcmp(a.slice.data(), b.slice.data(), Rdb_key_def::INDEX_NUMBER_SIZE)) { + return true; // This is a barrier + } else { + return false; // No barrier + } +} + /* Storage Engine initialization function, invoked when plugin is loaded. 
*/ @@ -6837,6 +7050,28 @@ static int rocksdb_init_internal(void *const p) { tx_db_options.write_policy = static_cast(rocksdb_write_policy); + if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) { + // rdb_log_status_error( + // status, "Can't have both range_locking and + // range_lock_manager_as_point"); + // DBUG_RETURN(HA_EXIT_FAILURE); + rocksdb_use_range_lock_manager_as_point = 0; + } + + if (rocksdb_use_range_locking) { + range_lock_mgr.reset( + rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory)); + tx_db_options.lock_mgr_handle = range_lock_mgr; + range_lock_mgr->SetEscalationBarrierFunc(rocksdb_escalation_barrier); + } + if (rocksdb_use_range_lock_manager_as_point) { + range_lock_mgr_used_as_point.reset( + rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory)); + tx_db_options.lock_mgr_handle = range_lock_mgr_used_as_point; + range_lock_mgr_used_as_point->SetEscalationBarrierFunc( + rocksdb_escalation_barrier); + } + status = check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); @@ -6870,6 +7105,14 @@ static int rocksdb_init_internal(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + if (range_lock_mgr) { + range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory); + sql_print_information("RocksDB: USING RANGE LOCKING"); + sql_print_information("RocksDB: Max lock memory=%llu", + rocksdb_max_lock_memory); + } else + sql_print_information("RocksDB: USING POINT LOCKING"); + // NO_LINT_DEBUG sql_print_information("RocksDB:Init column families..."); if (st_rdb_exec_time.exec("cf_manager::init", [&]() { @@ -7601,6 +7844,7 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton, m_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr), m_lock_rows(RDB_LOCK_NONE), + m_use_range_locking(false), m_keyread_only(false), m_iteration_only(false), m_insert_with_update(false), @@ -7929,6 +8173,7 @@ int ha_rocksdb::open(const char *const name, } m_lock_rows = RDB_LOCK_NONE; + m_use_range_locking = false; 
m_locked_row_action = THR_WAIT; m_key_descr_arr = m_tbl_def->m_key_descr_arr; @@ -9508,6 +9753,14 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key, } Rdb_transaction *const tx = get_or_create_tx(table->in_use); + + bool use_locking_iter = false; + + if ((rc = set_range_lock(tx, kd, find_flag, slice, end_range, + &use_locking_iter))) + DBUG_RETURN(rc); + if (use_locking_iter) m_iterator->set_use_locking(); + const bool is_new_snapshot = !tx->has_snapshot(); // Loop as long as we get a deadlock error AND we end up creating the @@ -9556,6 +9809,247 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key, DBUG_RETURN(rc); } +/* + @brief + Compute the range lock endpoints and set the range lock, if necessary + + @param use_locking_iter OUT If true, locks are not set and LockingIterator + should be used instead + + @detail + If the scanned range doesn't have the endpoint we're scanning towards, + don't set the lock, it will be too coarse. Indicate that LockingIterator + should be used, instead. + + == RangeFlagsShouldBeFlippedForRevCF == + When using reverse column families, the value of Endpoint::inf_suffix has + the reverse meaning. + + Let's consider a forward-ordered CF and some keys and endpoints in it: + + key=a, inf_suffix=false + key=ab + key=az + key=a, inf_suffix=true + + Now, let's put the same data and endpoints into a reverse-ordered CF. The + physical order of the data will be the reverse of the above: + + key=a, inf_suffix=true + key=az + key=ab + key=a, inf_suffix=false + + Note that inf_suffix=false comes *before* any values with the same prefix. + And inf_suffix=true comes *after* all values with the same prefix. + + The Endpoint comparison function in RocksDB doesn't "know" if the CF is + reverse-ordered or not. It uses the Key Comparator for key values, and + then it assumes that Endpoint(key=$VAL, inf_suffix=false) comes before + the row with key=$VAL. 
+ + The only way to achieve the required ordering is to flip the endpoint + flag value before passing class Endpoint to RocksDB. This function does + it before the lock_range() call. + + @return + 0 Ok + Other Error acquiring the lock (wait timeout, deadlock, etc) +*/ + +int ha_rocksdb::set_range_lock(Rdb_transaction *tx, const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const rocksdb::Slice &slice_arg, + const key_range *const end_key, + bool *use_locking_iterator) { + rocksdb::Slice end_slice; + uchar end_slice_buf[MAX_KEY_LENGTH]; + bool start_has_inf_suffix = false, end_has_inf_suffix = false; + rocksdb::Slice slice(slice_arg); + *use_locking_iterator = false; + + if (!m_use_range_locking) { + return 0; + } + bool big_range = false; + + /* + The 'slice' has the left endpoint of the range to lock. + Figure out the right endpoint. + */ + + if (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST) { + if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) { + // This is a full table/index scan + // (in case of HA_READ_PREFIX_LAST, a reverse-ordered one) + start_has_inf_suffix = false; + big_range = true; + } else { + /* + HA_READ_KEY_EXACT: + This is "key_part= const" interval. We need to lock this range: + (lookup_value, -inf) < key < (lookup_value, +inf) + HA_READ_PREFIX_LAST: + We get here for queries like: + + select * from t1 where pk1=const order by pk1 desc for update + + assuming this uses an index on (pk1, ...). + We get end_key=nullptr. + */ + start_has_inf_suffix = false; + end_has_inf_suffix = true; + end_slice = slice; + } + } else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) { + /* + We get here for queries like: + + select * from t1 where pk1=const1 and pk2 between const2 and const3 + order by pk1 desc + for update + + assuming this uses an index on (pk1, pk2). + The slice has the right endpoint: {const1, const3} + the end_key has the left endpoint: {const1, const2}. 
+ */ + + // Move the right endpoint from slice to end_slice + end_slice = slice; + + // Pack the left endpoint and make "slice" point to it + uchar pack_buffer[MAX_KEY_LENGTH]; + uint end_slice_size = kd.pack_index_tuple( + table, pack_buffer, end_slice_buf, end_key->key, end_key->keypart_map); + slice = + rocksdb::Slice(reinterpret_cast(end_slice_buf), end_slice_size); + start_has_inf_suffix = false; + end_has_inf_suffix = true; + } else if (find_flag == HA_READ_BEFORE_KEY) { + /* + We get here for queries like + select * from t1 + where pk <1007 order by pk desc limit 2 for update + select * from t1 + where pk >=800 and pk <1007 order by pk desc limit 2 for update + */ + + // Move the right endpoint from slice to end_slice + end_slice = slice; + + if (end_key) { + uchar pack_buffer[MAX_KEY_LENGTH]; + uint end_slice_size = + kd.pack_index_tuple(table, pack_buffer, end_slice_buf, end_key->key, + end_key->keypart_map); + + slice = rocksdb::Slice(reinterpret_cast(end_slice_buf), + end_slice_size); + + // end_has_inf_suffix is false, because we're looking keyflag == HA_READ_AFTER_KEY) { + // this is "key_part <= const". + end_has_inf_suffix = true; + } else if (end_key->flag == HA_READ_BEFORE_KEY) { + // this is "key_part < const", non-inclusive. 
+ end_has_inf_suffix = false; + } else { + // Unknown type of range, shouldn't happen + DBUG_ASSERT(0); + big_range = true; + } + + uchar pack_buffer[MAX_KEY_LENGTH]; + uint end_slice_size = kd.pack_index_tuple( + table, pack_buffer, end_slice_buf, end_key->key, end_key->keypart_map); + + end_slice = + rocksdb::Slice(reinterpret_cast(end_slice_buf), end_slice_size); + } else { + big_range = true; +#if 0 + // The below is code to handle this without LockingIterator: + // No end key + // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY + if (find_flag == HA_READ_KEY_OR_NEXT) + start_has_inf_suffix= false; + else if (find_flag == HA_READ_AFTER_KEY) + start_has_inf_suffix= true; + else + DBUG_ASSERT(0); + + uint end_slice_size; + kd.get_infimum_key(end_slice_buf, &end_slice_size); + end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size); + end_has_inf_suffix= true; +#endif + } + + if (big_range) { + *use_locking_iterator = true; + return 0; + } + + rocksdb::Endpoint start_endp; + rocksdb::Endpoint end_endp; + + if (kd.m_is_reverse_cf) { + // Flip the endpoint flag values, as explained in the + // RangeFlagsShouldBeFlippedForRevCF comment above. + start_endp = rocksdb::Endpoint(end_slice, !end_has_inf_suffix); + end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix); + } else { + start_endp = rocksdb::Endpoint(slice, start_has_inf_suffix); + end_endp = rocksdb::Endpoint(end_slice, end_has_inf_suffix); + } + + /* + RocksDB's iterator is reading the snapshot of the data that was taken at + the time the iterator was created. + + After we've got a lock on the range, we'll need to refresh the iterator + to read the latest contents. (If we use the iterator created before the + lock_range() call, we may miss the changes that were made/committed after + the iterator was created but before the lock_range() call was made). 
+ + RocksDB has Iterator::Refresh() method, but alas, it is not implemented for + the iterator returned by Transaction object (Transaction object returns + BaseDeltaIterator which allows one to see the transactions's own changes). + + Our solution to this is to release the iterator and create the new one. + We release it here, it will be created as soon as there's a need to read + records. + */ + m_iterator->reset(); + + auto s = tx->lock_range(kd.get_cf(), start_endp, end_endp); + if (!s.ok()) { + return ( + tx->set_status_error(table->in_use, s, kd, m_tbl_def, m_table_handler)); + } + return 0; +} + /* See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL index navigation commands are converted into RocksDB lookup commands. @@ -10070,7 +10564,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf, } } - if (rc) { + if (rc != HA_EXIT_SUCCESS) { break; } @@ -10081,7 +10575,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf, table->m_status = 0; rc = 0; } else if (active_index == table->s->primary_key) { - if (m_lock_rows != RDB_LOCK_NONE) { + if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) { DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete"); /* We need to put a lock and re-read */ bool skip_row = false; @@ -10281,7 +10775,9 @@ void ha_rocksdb::unlock_row() { DBUG_VOID_RETURN; } - if (m_lock_rows != RDB_LOCK_NONE) { + // Don't release the lock when using range locking. + // This breaks m_row_lock_count + if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) { Rdb_transaction *const tx = get_or_create_tx(table->in_use); tx->release_lock(*m_pk_descr, std::string(m_last_rowkey.ptr(), m_last_rowkey.length())); @@ -10775,6 +11271,8 @@ int ha_rocksdb::check_and_lock_sk( lock for this key. 
*/ if (!(key_info->flags & HA_NOSAME)) { + if (rocksdb_use_range_locking) + return check_and_lock_non_unique_sk(key_id, row_info); return HA_EXIT_SUCCESS; } @@ -10891,6 +11389,53 @@ int ha_rocksdb::check_and_lock_sk( return rc; } +/** + @brief + Lock the non-unique sk for range locking +*/ +int ha_rocksdb::check_and_lock_non_unique_sk( + const uint key_id, const struct update_row_info &row_info) { + DBUG_ASSERT(rocksdb_use_range_locking); + const Rdb_key_def &kd = *m_key_descr_arr[key_id]; + bool store_row_debug_checksums = should_store_row_debug_checksums(); + + if (row_info.old_data != nullptr) { + rocksdb::Slice old_key_slice; + int old_packed_size; + + old_packed_size = kd.pack_record( + table, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old, + &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0, + nullptr, m_ttl_bytes); + + old_key_slice = rocksdb::Slice( + reinterpret_cast(m_sk_packed_tuple_old), old_packed_size); + + auto s = row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice); + if (!s.ok()) { + return (row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + } + + int new_packed_size; + rocksdb::Slice new_key_slice; + rocksdb::Slice new_value_slice; + new_packed_size = kd.pack_record( + table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, &m_sk_tails, + 0, row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes); + new_key_slice = rocksdb::Slice( + reinterpret_cast(m_sk_packed_tuple), new_packed_size); + + auto s = row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice); + if (!s.ok()) { + return (row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + + return HA_EXIT_SUCCESS; +} + /** Enumerate all keys to check their uniquess and also lock it @@ -11692,6 +12237,19 @@ int ha_rocksdb::delete_row(const uchar *const buf) { nullptr, false, hidden_pk_id); rocksdb::Slice secondary_key_slice( reinterpret_cast(m_sk_packed_tuple), packed_size); + + /* + 
For point locking, Deleting on secondary key doesn't need any locks. + Range locking must get a lock. + */ + if (rocksdb_use_range_locking) { + auto s = tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice); + if (!s.ok()) { + DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + } + tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), secondary_key_slice); bytes_written += secondary_key_slice.size(); @@ -12363,8 +12921,14 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { } } tx->m_n_mysql_tables_in_use++; - rocksdb_register_tx(rocksdb_hton, thd, tx); + rocksdb_register_tx(rocksdb_hton, thd, tx, (m_lock_rows != RDB_LOCK_NONE)); tx->io_perf_start(&m_io_perf); + + m_use_range_locking = false; + if (rocksdb_use_range_locking && m_lock_rows != RDB_LOCK_NONE && + my_core::thd_tx_isolation(thd) >= ISO_REPEATABLE_READ) { + m_use_range_locking = true; + } } DBUG_RETURN(res); @@ -12391,7 +12955,7 @@ int ha_rocksdb::start_stmt(THD *const thd, Rdb_transaction *const tx = get_or_create_tx(thd); read_thd_vars(thd); - rocksdb_register_tx(ht, thd, tx); + rocksdb_register_tx(ht, thd, tx, (m_lock_rows != RDB_LOCK_NONE)); tx->io_perf_start(&m_io_perf); DBUG_RETURN(HA_EXIT_SUCCESS); @@ -15078,6 +15642,34 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)), return 0; } +// +// Lock Tree Status variables +// +static longlong rocksdb_locktree_escalation_count = 1234; +static longlong rocksdb_locktree_current_lock_memory = 0; + +static SHOW_VAR rocksdb_locktree_status_variables[] = { + DEF_STATUS_VAR_FUNC("escalation_count", &rocksdb_locktree_escalation_count, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("current_lock_memory", + &rocksdb_locktree_current_lock_memory, SHOW_LONGLONG), + // end of the array marker + {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; + +static SHOW_VAR rocksdb_empty_status_variables[] = { + {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; + +static void show_rocksdb_locktree_vars(THD *, 
SHOW_VAR *var, char *) { + var->type = SHOW_ARRAY; + if (range_lock_mgr) { + auto status = range_lock_mgr->GetStatus(); + rocksdb_locktree_escalation_count = status.escalation_count; + rocksdb_locktree_current_lock_memory = status.current_lock_memory; + var->value = reinterpret_cast(&rocksdb_locktree_status_variables); + } else + var->value = reinterpret_cast(&rocksdb_empty_status_variables); +} + static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_miss), DEF_STATUS_VAR(block_cache_hit), @@ -15207,6 +15799,8 @@ static SHOW_VAR rocksdb_status_vars[] = { SHOW_SCOPE_GLOBAL}, {"rocksdb_stall", reinterpret_cast(&show_rocksdb_stall_vars), SHOW_FUNC, SHOW_SCOPE_GLOBAL}, + {"rocksdb_locktree", reinterpret_cast(show_rocksdb_locktree_vars), + SHOW_FUNC, SHOW_SCOPE_GLOBAL}, {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; /* @@ -16176,6 +16770,23 @@ static void rocksdb_set_delayed_write_rate( RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +static void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR *, + void * /*var_ptr*/, const void *save) { + const uint64_t new_val = *static_cast(save); + if (rocksdb_max_lock_memory != new_val) { + if (range_lock_mgr->SetMaxLockMemory(new_val)) { + /* NO_LINT_DEBUG */ + sql_print_warning("MyRocks: failed to set max_lock_memory"); + push_warning_printf( + thd, Sql_condition::SL_WARNING, ER_ERROR_WHEN_EXECUTING_COMMAND, + "Cannot set max_lock_memory to size below currently used"); + } else { + // Succeeded + rocksdb_max_lock_memory = new_val; + } + } +} + static void rocksdb_set_max_latest_deadlocks( THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)), void *var_ptr MY_ATTRIBUTE((unused)), const void *save) { @@ -16183,7 +16794,11 @@ static void rocksdb_set_max_latest_deadlocks( const uint32_t new_val = *static_cast(save); if (rocksdb_max_latest_deadlocks != new_val) { rocksdb_max_latest_deadlocks = new_val; - rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); + if (range_lock_mgr) { + auto 
n = rocksdb_max_latest_deadlocks; + range_lock_mgr->SetRangeDeadlockInfoBufferSize(n); + } else + rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); } RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } @@ -16899,11 +17514,23 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) { } rocksdb::Iterator *rdb_tx_get_iterator( - THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter, + Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, + const std::shared_ptr &kd, bool skip_bloom_filter, + const rocksdb::Slice &lower_bound_slice, + const rocksdb::Slice &upper_bound_slice, bool read_current, + bool create_snapshot) { + return tx->get_iterator(column_family, kd, skip_bloom_filter, + lower_bound_slice, upper_bound_slice, read_current, + create_snapshot); +} + +rocksdb::Iterator *rdb_tx_get_iterator( + THD *thd, rocksdb::ColumnFamilyHandle *const cf, + const std::shared_ptr &kd, bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, - const rocksdb::Snapshot **snapshot, bool read_current, - bool create_snapshot) { + const rocksdb::Snapshot **snapshot, bool read_current, bool create_snapshot, + bool use_locking_iter) { if (commit_in_the_middle(thd)) { DBUG_ASSERT(snapshot && *snapshot == nullptr); if (snapshot) { @@ -16918,8 +17545,9 @@ rocksdb::Iterator *rdb_tx_get_iterator( } } else { Rdb_transaction *tx = get_tx_from_thd(thd); - return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound, - eq_cond_upper_bound, read_current, create_snapshot); + return tx->get_iterator(cf, kd, skip_bloom_filter, eq_cond_lower_bound, + eq_cond_upper_bound, read_current, create_snapshot, + use_locking_iter); } } @@ -16953,6 +17581,12 @@ void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd, tx->release_lock(kd, std::string(key.data(), key.size()), force); } +rocksdb::Status rdb_tx_lock_range(Rdb_transaction *tx, const Rdb_key_def &kd, + const 
rocksdb::Endpoint &start_key, + const rocksdb::Endpoint &end_key) { + return tx->lock_range(kd.get_cf(), start_key, end_key); +} + void rdb_tx_multi_get(Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 57551e9647fb..c5bec683cf56 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -82,6 +82,9 @@ extern PSI_rwlock_key key_rwlock_read_free_rpl_tables; #endif extern Regex_list_handler rdb_read_free_regex_handler; static bool rocksdb_column_default_value_as_expression = true; + +extern bool rocksdb_use_range_locking; + /** @brief Rdb_table_handler is a reference-counted structure storing information for @@ -266,6 +269,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { /* Type of locking to apply to rows */ Rdb_lock_type m_lock_rows; + bool m_use_range_locking; + thr_locked_row_action m_locked_row_action; /* true means we're doing an index-only read. false means otherwise. 
*/ @@ -330,6 +335,12 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { const Rdb_key_def &kd, const rocksdb::Slice &key) const; + int set_range_lock(Rdb_transaction *tx, const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const rocksdb::Slice &slice, + const key_range *const end_key, + bool *use_locking_iterator); + int get_row_by_rowid(uchar *const buf, const char *const rowid, const uint rowid_size, bool *skip_row = nullptr, const bool skip_lookup = false, @@ -786,6 +797,10 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { const struct update_row_info &row_info, bool *const found, const bool skip_unique_check) MY_ATTRIBUTE((__warn_unused_result__)); + + int check_and_lock_non_unique_sk(const uint key_id, + const struct update_row_info &row_info) + MY_ATTRIBUTE((__warn_unused_result__)); int check_uniqueness_and_lock(const struct update_row_info &row_info, bool pk_changed, const bool skip_unique_check) MY_ATTRIBUTE((__warn_unused_result__)); @@ -997,6 +1012,9 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { /* Need to build decoder on next read operation */ bool m_need_build_decoder; + + int iter_status_to_retval(rocksdb::Iterator *it, const Rdb_key_def &kd, + int not_found_code); }; /* @@ -1134,11 +1152,12 @@ Rdb_transaction *get_tx_from_thd(THD *const thd); const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx); rocksdb::Iterator *rdb_tx_get_iterator( - THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter, + THD *thd, rocksdb::ColumnFamilyHandle *const cf, + const std::shared_ptr &kd, bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, const rocksdb::Snapshot **snapshot, bool read_current = false, - bool create_snapshot = true); + bool create_snapshot = true, bool use_locking_iter = false); rocksdb::Status rdb_tx_get(Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, @@ -1151,6 
+1170,10 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx, rocksdb::PinnableSlice *const value, bool exclusive, bool skip_wait); +rocksdb::Status rdb_tx_lock_range(Rdb_transaction *tx, const Rdb_key_def &kd, + const rocksdb::Endpoint &start_key, + const rocksdb::Endpoint &end_key); + void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd, const rocksdb::Slice &key, bool force); @@ -1212,4 +1235,6 @@ extern std::atomic rocksdb_partial_index_rows_sorted; extern std::atomic rocksdb_partial_index_rows_materialized; extern bool rocksdb_enable_tmp_table; extern bool rocksdb_enable_delete_range_for_drop_index; + +extern std::shared_ptr range_lock_mgr; } // namespace myrocks diff --git a/storage/rocksdb/locking-iterator-partial-index.txt b/storage/rocksdb/locking-iterator-partial-index.txt new file mode 100644 index 000000000000..d3ffb41d8f00 --- /dev/null +++ b/storage/rocksdb/locking-iterator-partial-index.txt @@ -0,0 +1,120 @@ + +This document describes how Locking Reads are implemented with partial index +iterator (Rdb_iterator_partial) + +== Closed ranges == +ha_rocksdb will make the lock_range() call before making any reads. + +We just need to make sure that after that, the iterator reads the latest +committed data (not the data from the read snapshot). + +== Open ranges and LockingIterator == + +With Open Ranges, regular indexes use LockingIterator. + +How does one make Rdb_iterator_partial use the LockingIterator? + +=== Background info: Rdb_iterator_Partial === + +Partial iterator is used for partial secondary keys. + +PK and SK share the same key prefix (see 'partial_group_keyparts=N' parameter). + +For a given value of key prefix: + +* if the number of rows exceeds 'partial_group_threshold' parameter, one reads + the secondary index, as usual (the SK is "materialized" for this prefix) + +* if the number of rows is less than 'partial_group_threshold', then the SK is + "not materialized" and has no rows for the prefix. 
The rows are obtained by + reading the records using PK, buffering+sorting them in memory, and then + returning them. + +* Switching from non-materialized to materialized is done "lazily" on-read. + The write locks the group prefix in SK (see + Rdb_iterator_partial::materialize_prefix()) + +* Update/Delete operations also lock the group prefix in SK (see + ha_rocksdb::acquire_prefix_lock) + +* De-materialization is currently not performed. + +=== Regular reads with Rdb_iterator_partial === + +Regular (that is, non locking) reads are done as follows: + +Step #1: first, we need to figure out the key prefix we're going to read. +There are two possibilities: + +A. It can be inferred from the lookup key value. +B. It cannot be inferred (e.g. we scan from the start of the table). + In this case, we read from the PK to find out the first key prefix. + +See Rdb_iterator_partial::get_prefix_from_start(). + +Step #2 is to read rows within this prefix. + +We first try to read through the SK. If it has a row within the prefix, it +means this prefix is materialized and we continue to read from the SK within +the bounds of the key prefix. + +If the SK has no data we read all rows through the PK, buffer, sort them, and +return. (See Rdb_iterator_partial::read_prefix_from_pk()) + +Step #3: +When we have exhausted rows in this key prefix, we check if we need to continue +the scan. +If we do, we take the current prefix value and try to get the next row after +it using the approach in Step #1. See Rdb_iterator_partial::get_next_prefix(). + +=== Locking Reads with Rdb_iterator_partial === + +This section describes how one can perform locking reads for Steps 1-3 from the +previous section. + +Step #1: +for Case A, there's no need to lock anything. + +for case B, we need to lock the range from the lookup key to the start of +the first key prefix. This can be achieved by making m_iterator_pk use +a LockingIterator. 
+ +(PK iterator uses a prefix of the lookup key, so we may end up with a +coarser-grained lock, but this is still correct). + +Step #2: + +Quoting the previous section: + +> We first try to read through the SK. +> If it has a row within the prefix, it means this prefix is materialized and we +> continue to read from the SK within the bounds of the key prefix. + +If we use a LockingIterator for scanning the SK, we are doing a locking read +and have achieved our goal. + +> If the SK has no data we read all rows through the PK + +Suppose we use a LockingIterator to try read through the SK. The read is done +with this call in Rdb_iterator_partial::seek_next_prefix: + + rc = Rdb_iterator_base::seek( + direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key, + false, empty_end_key); + +If the SK has no data for the cur_prefix_key, the LockingIterator will lock the +range before returning from this call. +Note that it will lock just this key prefix: this is achieved by use of iterator +bounds. + +If there is a Materialization operation in progress, the locking read will +eventually come into conflict with it. To avoid hitting the conflict at the +end, Rdb_iterator_partial::materialize_prefix() is made to acquire a lock +on the whole prefix when Range Locking is used. + +Step #3: The same approach as in step #1. If we use LockingIterator for the PK +it will correctly lock the gap between the prefixes (or just the gap after the +last prefix if there are no further prefixes). + +This way, one can provide LockingIterator semantics with Rdb_iterator_partial. 
+ diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc index c2d09ca846f9..c51c233a57a6 100644 --- a/storage/rocksdb/nosql_access.cc +++ b/storage/rocksdb/nosql_access.cc @@ -1383,10 +1383,11 @@ class select_exec { } rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle *cf, + const std::shared_ptr &kd, bool use_bloom, const rocksdb::Slice &lower_bound, const rocksdb::Slice &upper_bound) { - return rdb_tx_get_iterator(m_thd, cf, !use_bloom, lower_bound, + return rdb_tx_get_iterator(m_thd, cf, kd, !use_bloom, lower_bound, upper_bound, nullptr); } @@ -2220,6 +2221,7 @@ bool INLINE_ATTR select_exec::setup_iterator(THD *thd) { } else { m_iterator.reset( new Rdb_iterator_base(thd, m_key_def, m_pk_def, m_tbl_def)); + // m_lower_bound_slice, m_upper_bound_slice); } return m_iterator == nullptr; diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc index ca90a08cd75e..89e8fd74a35d 100644 --- a/storage/rocksdb/rdb_i_s.cc +++ b/storage/rocksdb/rdb_i_s.cc @@ -1995,6 +1995,59 @@ static ST_FIELD_INFO rdb_i_s_lock_info_fields_info[] = { ROCKSDB_FIELD_INFO("MODE", 32, MYSQL_TYPE_STRING, 0), ROCKSDB_FIELD_INFO_END}; +// Dump the locked key (or range) into a string. A template and its +// specializations +template +std::string dump_key(const LockInfo &info); + +// Specialization for point lock manager. +template <> +std::string dump_key(const rocksdb::KeyLockInfo &info) { + return rdb_hexdump(info.key.c_str(), info.key.length(), FN_REFLEN); +} + +// Specialization for Range Lock manager. +template <> +std::string dump_key( + const rocksdb::RangeLockInfo &info) { + // todo: Do we need to enforce the total length limit of FN_REFLEN? + return rdb_hexdump_range(info.start, info.end); +} + +// +// A template that walks the Lock info data structure and dumps its contents. 
+// +template +int dump_locks(my_core::THD *thd, my_core::TABLE *table, LockInfo &lock_info) { + for (const auto &lock : lock_info) { + const uint32_t cf_id = lock.first; + const auto &one_lock_info = lock.second; + + // Call the templated function to get string representation of the acquired + // lock + std::string key_hexstr = dump_key(one_lock_info); + + for (const auto &id : one_lock_info.ids) { + table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, true); + table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); + + table->field[RDB_LOCKS_FIELD::KEY]->store( + key_hexstr.c_str(), key_hexstr.size(), system_charset_info); + table->field[RDB_LOCKS_FIELD::MODE]->store( + one_lock_info.exclusive ? "X" : "S", 1, system_charset_info); + + /* Tell MySQL about this row in the virtual table */ + int ret = + static_cast(my_core::schema_table_store_record(thd, table)); + + if (ret != 0) { + return ret; + } + } + } + return 0; +} + /* Fill the information_schema.rocksdb_locks virtual table */ static int rdb_i_s_lock_info_fill_table( my_core::THD *const thd, my_core::TABLE_LIST *const tables, @@ -2014,36 +2067,16 @@ static int rdb_i_s_lock_info_fill_table( DBUG_RETURN(ret); } - /* cf id -> rocksdb::KeyLockInfo */ - std::unordered_multimap lock_info = - rdb->GetLockStatusData(); - - for (const auto &lock : lock_info) { - const uint32_t cf_id = lock.first; - const auto &key_lock_info = lock.second; - const auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(), - key_lock_info.key.length(), FN_REFLEN); - - for (const auto &id : key_lock_info.ids) { - tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, - true); - tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); - - tables->table->field[RDB_LOCKS_FIELD::KEY]->store( - key_hexstr.c_str(), key_hexstr.size(), system_charset_info); - tables->table->field[RDB_LOCKS_FIELD::MODE]->store( - key_lock_info.exclusive ? 
"X" : "S", 1, system_charset_info); - - /* Tell MySQL about this row in the virtual table */ - ret = static_cast( - my_core::schema_table_store_record(thd, tables->table)); - - if (ret != 0) { - break; - } - } + if (range_lock_mgr) { + // Use Range Lock Manager's interface for obtaining more specific + // information about the acquired locks + auto lock_info = range_lock_mgr->GetRangeLockStatusData(); + ret = dump_locks(thd, tables->table, lock_info); + } else { + // Obtain information from point lock manager + auto lock_info = rdb->GetLockStatusData(); + ret = dump_locks(thd, tables->table, lock_info); } - DBUG_RETURN(ret); } diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc index 11e538370fde..5e532b940782 100644 --- a/storage/rocksdb/rdb_iterator.cc +++ b/storage/rocksdb/rdb_iterator.cc @@ -36,6 +36,8 @@ Rdb_iterator_base::Rdb_iterator_base(THD *thd, m_tbl_def(tbl_def), m_thd(thd), m_scan_it(nullptr), + m_iter_uses_locking(false), + m_iter_should_use_locking(false), m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr), m_scan_it_lower_bound(nullptr), @@ -83,7 +85,7 @@ int Rdb_iterator_base::read_before_key(const bool full_key_match, return HA_EXIT_SUCCESS; } - return HA_ERR_END_OF_FILE; + return iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE); } int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) { @@ -98,12 +100,15 @@ int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) { */ rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice); - return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE; + return is_valid_iterator(m_scan_it) + ? 
HA_EXIT_SUCCESS + : iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE); } void Rdb_iterator_base::release_scan_iterator() { delete m_scan_it; m_scan_it = nullptr; + m_iter_uses_locking = false; if (m_scan_it_snapshot) { auto rdb = rdb_get_rocksdb_db(); @@ -154,7 +159,8 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice, and re-create Iterator. */ - if (m_scan_it_skips_bloom != skip_bloom) { + if (m_scan_it_skips_bloom != skip_bloom || + m_iter_uses_locking != m_iter_should_use_locking) { release_scan_iterator(); } @@ -164,10 +170,11 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice, */ if (!m_scan_it) { m_scan_it = rdb_tx_get_iterator( - m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice, + m_thd, m_kd->get_cf(), m_kd, skip_bloom, m_scan_it_lower_bound_slice, m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current, - !read_current); + !read_current, m_iter_should_use_locking); m_scan_it_skips_bloom = skip_bloom; + m_iter_uses_locking = m_iter_should_use_locking; } } @@ -208,6 +215,18 @@ int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag, return Rdb_key_def::INDEX_NUMBER_SIZE; } +int Rdb_iterator_base::iter_status_to_retval( + rocksdb::Iterator *it, const std::shared_ptr kd, + int not_found_code) { + if (it->Valid()) return HA_EXIT_SUCCESS; + + rocksdb::Status s = it->status(); + if (s.ok() || s.IsNotFound()) return not_found_code; + + Rdb_transaction *tx = get_tx_from_thd(m_thd); + return rdb_tx_set_status_error(tx, s, *kd, m_tbl_def); +} + int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) { int rc = 0; const auto &kd = *m_kd; @@ -237,7 +256,7 @@ int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) { } if (!is_valid_iterator(m_scan_it)) { - rc = HA_ERR_END_OF_FILE; + rc = iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE); break; } @@ -617,8 +636,15 @@ int Rdb_iterator_partial::materialize_prefix() 
{ const char *old_proc_info = m_thd->get_proc_info(); thd_proc_info(m_thd, "Materializing group in partial index"); - auto s = - rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, true, false); + rocksdb::Status s; + if (rocksdb_use_range_locking) { + rocksdb::Endpoint start_endp(cur_prefix_key, false); + rocksdb::Endpoint end_endp(cur_prefix_key, true); + s = rdb_tx_lock_range(tx, *m_kd, start_endp, end_endp); + } else { + s = rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, true, false); + } + if (!s.ok()) { thd_proc_info(m_thd, old_proc_info); return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def); @@ -815,7 +841,13 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag, return HA_ERR_INTERNAL_ERROR; } + // Save the value because reset() clears it: + auto save_iter_should_use_locking = m_iter_should_use_locking; reset(); + m_iter_should_use_locking = save_iter_should_use_locking; + // Range Locking: when the iterator does a locking read, + // both secondary and primary key iterators should use locking reads. + if (m_iter_should_use_locking) m_iterator_pk.set_use_locking(); bool direction = (find_flag == HA_READ_KEY_EXACT) || (find_flag == HA_READ_AFTER_KEY) || @@ -831,6 +863,15 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag, m_cur_prefix_key_len); m_kd->get_infimum_key(m_cur_prefix_key, &tmp); + // + // Range Locking note: When using a locking iterator (i.e. + // m_iter_should_use_locking=true), this will read (and lock) + // and the value space up to the next prefix. + // If the next prefix is not materialized, it will lock the whole + // prefix in the secondary key. It will not lock more than that, + // because the iterator use the iterator bounds to limit the scan + // to the prefix specified. 
+ // rc = Rdb_iterator_base::seek(find_flag, start_key, true, end_key, read_current); diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h index 7fd1ff470c7a..ad4e8990af56 100644 --- a/storage/rocksdb/rdb_iterator.h +++ b/storage/rocksdb/rdb_iterator.h @@ -69,6 +69,8 @@ class Rdb_iterator { virtual rocksdb::Slice key() = 0; virtual rocksdb::Slice value() = 0; virtual void reset() = 0; + + virtual void set_use_locking() = 0; }; class Rdb_iterator_base : public Rdb_iterator { @@ -107,7 +109,12 @@ class Rdb_iterator_base : public Rdb_iterator { rocksdb::Slice value() override { return m_scan_it->value(); } - void reset() override { release_scan_iterator(); } + void reset() override { + m_iter_should_use_locking = false; + release_scan_iterator(); + } + + void set_use_locking() override { m_iter_should_use_locking = true; } protected: friend class Rdb_iterator; @@ -123,6 +130,18 @@ class Rdb_iterator_base : public Rdb_iterator { /* Iterator used for range scans and for full table/index scans */ rocksdb::Iterator *m_scan_it; + /* Whether m_scan_it is a locking iterator */ + bool m_iter_uses_locking; + + /* + Whether the iterator should use locking. The intended workflow is: + + iter.set_use_locking() // sets m_iter_should_use_locking=true + iter.seek() // this will re-create m_scan_it to be the right kind + // of iterator + */ + bool m_iter_should_use_locking; + /* Whether m_scan_it was created with skip_bloom=true */ bool m_scan_it_skips_bloom; @@ -136,8 +155,17 @@ class Rdb_iterator_base : public Rdb_iterator { uchar *m_prefix_buf; rocksdb::Slice m_prefix_tuple; + + int iter_status_to_retval(rocksdb::Iterator *it, + const std::shared_ptr kd, + int not_found_code); }; +/* + Iterator for reading partial secondary indexes + + It can do locking reads, see locking_iterator_partial_index.txt for details. 
+*/ class Rdb_iterator_partial : public Rdb_iterator_base { private: TABLE *m_table; diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc new file mode 100644 index 000000000000..c4b37d522457 --- /dev/null +++ b/storage/rocksdb/rdb_locking_iter.cc @@ -0,0 +1,278 @@ + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#define MYSQL_SERVER 1 + +/* This C++ file's header file */ +#include "./rdb_locking_iter.h" + +namespace myrocks { + +rocksdb::Iterator *GetLockingIterator( + rocksdb::Transaction *trx, const rocksdb::ReadOptions &read_options, + rocksdb::ColumnFamilyHandle *column_family, + const std::shared_ptr &kd, ulonglong *counter) { + return new LockingIterator(trx, column_family, kd, read_options, counter); +} + +/* + @brief + Seek to the first key K that is equal or greater than target, + locking the range [target; K]. +*/ + +void LockingIterator::Seek(const rocksdb::Slice &target) { + m_have_locked_until = false; + m_iter = m_txn->GetIterator(m_read_opts, m_cfh); + m_iter->Seek(target); + ScanForward(target, false); +} + +void LockingIterator::SeekForPrev(const rocksdb::Slice &target) { + m_have_locked_until = false; + m_iter = m_txn->GetIterator(m_read_opts, m_cfh); + m_iter->SeekForPrev(target); + ScanBackward(target, false); +} + +/* + @brief + Move the iterator to the next key, locking the range between the current + and the next key. + + @detail + Implementation is similar to Seek(next_key). Since we don't know what the + next_key is, we reach it by calling { Seek(current_key); Next(); } +*/ +void LockingIterator::Next() { + DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next"); + assert(Valid()); + // Save the current key value. 
We need it as the left endpoint + // of the range lock we're going to acquire + std::string current_key = m_iter->key().ToString(); + + m_iter->Next(); + ScanForward(rocksdb::Slice(current_key), true); +} + +/* + @brief + Move the iterator to the previous key, locking the range between the current + and the previous key. +*/ + +void LockingIterator::Prev() { + assert(Valid()); + + std::string current_key = m_iter->key().ToString(); + m_iter->Prev(); + ScanBackward(rocksdb::Slice(current_key), true); +} + +/* + @brief + Lock range from target to end_key. + + @detail + In forward-ordered scan, target < end_key. In backward-ordered scan, it's + other way around. + + We might have already locked a subset of this range, a subrange that + starts from target and extends to some point between target and end_key. +*/ +void LockingIterator::lock_up_to(bool scan_forward, + const rocksdb::Slice &target, + const rocksdb::Slice &end_key) { + const int inv = scan_forward ? 1 : -1; + auto cmp = m_cfh->GetComparator(); + bool endp_arg = m_kd->m_is_reverse_cf; + + if (m_have_locked_until && + cmp->Compare(end_key, rocksdb::Slice(m_locked_until)) * inv <= 0) { + // We've already locked this range. The following has happened: + // - m_iter->key() returned $KEY + // - other transaction(s) have inserted row $ROW before the $KEY. + // - we got a range lock on [range_start, $KEY] + // - we've read $ROW and returned. + // Now, we're looking to lock [$ROW, $KEY] but we don't need to, + // we already have a lock on this range. 
+ } else { + if (scan_forward) { + m_status = m_txn->GetRangeLock(m_cfh, rocksdb::Endpoint(target, endp_arg), + rocksdb::Endpoint(end_key, endp_arg)); + } else { + m_status = + m_txn->GetRangeLock(m_cfh, rocksdb::Endpoint(end_key, endp_arg), + rocksdb::Endpoint(target, endp_arg)); + } + + if (!m_status.ok()) return; + + // Save the bound where we locked until: + m_have_locked_until = true; + m_locked_until.assign(end_key.data(), end_key.size()); + if (m_lock_count) (*m_lock_count)++; + } +} + +/* + Lock the range from target till the iterator end point that we are scaning + towards. If there's no iterator bound, use index start (or end, depending + on the scan direction) +*/ +void LockingIterator::lock_till_iterator_end(bool scan_forward, + const rocksdb::Slice &target) { + rocksdb::Slice end; + uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uint size; + if (scan_forward) { + if (m_read_opts.iterate_upper_bound) + end = *m_read_opts.iterate_upper_bound; + else { + if (m_kd->m_is_reverse_cf) + m_kd->get_infimum_key(buf, &size); + else + m_kd->get_supremum_key(buf, &size); + + DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE); + end = rocksdb::Slice((const char *)buf, size); + } + } else { + if (m_read_opts.iterate_lower_bound) + end = *m_read_opts.iterate_lower_bound; + else { + if (m_kd->m_is_reverse_cf) + m_kd->get_supremum_key(buf, &size); + else + m_kd->get_infimum_key(buf, &size); + + DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE); + end = rocksdb::Slice((const char *)buf, size); + } + } + // This will set m_status accordingly + lock_up_to(scan_forward, target, end); +} + +/* + Lock the range between [target, (current m_iter position)] and position + the iterator on the first record in it. + + @param call_next false means current iterator position is achieved by + calling Seek(target). 
+ true means one also needs to call Next() +*/ +void LockingIterator::Scan(bool scan_forward, const rocksdb::Slice &target, + bool call_next) { + if (!m_iter->Valid()) { + m_status = m_iter->status(); + m_valid = false; + if (m_status.ok()) { + // m_iter has reached EOF + lock_till_iterator_end(scan_forward, target); + } + return; + } + + while (1) { + DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.locking_iter_scan"); + + if (my_core::thd_killed(current_thd)) { + m_status = rocksdb::Status::Aborted(); + m_valid = false; + return; + } + + const int inv = scan_forward ? 1 : -1; + auto cmp = m_cfh->GetComparator(); + + auto end_key = m_iter->key(); + std::string end_key_copy = end_key.ToString(); + + lock_up_to(scan_forward, target, end_key); + if (!m_status.ok()) { + // Failed to get a lock (most likely lock wait timeout) + m_valid = false; + return; + } + + // Ok, now we have a lock which is inhibiting modifications in the range + // Somebody might have done external modifications, though: + // - removed the key we've found + // - added a key before that key. + + // First, refresh the iterator: + delete m_iter; + m_iter = m_txn->GetIterator(m_read_opts, m_cfh); + + // Then, try seeking to the same row + if (scan_forward) + m_iter->Seek(target); + else + m_iter->SeekForPrev(target); + + if (call_next && m_iter->Valid() && !cmp->Compare(m_iter->key(), target)) { + if (scan_forward) + m_iter->Next(); + else + m_iter->Prev(); + } + + if (m_iter->Valid()) { + if (cmp->Compare(m_iter->key(), rocksdb::Slice(end_key_copy)) * inv <= + 0) { + // Ok, the found key is within the locked range. + m_status = rocksdb::Status::OK(); + m_valid = true; + break; + } else { + // We've got a key but it is outside the range we've locked. + // Re-try the lock-and-read step. 
+ continue; + } + } else { + m_valid = false; + m_status = m_iter->status(); + if (m_status.ok()) { + // m_iter has reached EOF + lock_till_iterator_end(scan_forward, target); + } + break; + } + } +} + +/* + @detail + Ideally, this function should + - find the first key $first_key + - lock the range [-inf; $first_key] + - return, the iterator is positioned on $first_key + + The problem here is that we cannot have "-infinity" bound. + + Note: we don't have a practical use for this function - MyRocks always + searches within one index_name.table_name, which means we are only looking + at the keys with index_number as the prefix. +*/ + +void LockingIterator::SeekToFirst() { + DBUG_ASSERT(0); + m_status = rocksdb::Status::NotSupported("Not implemented"); + m_valid = false; +} + +/* + @detail + See SeekToFirst. +*/ + +void LockingIterator::SeekToLast() { + DBUG_ASSERT(0); + m_status = rocksdb::Status::NotSupported("Not implemented"); + m_valid = false; +} + +} // namespace myrocks diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h new file mode 100644 index 000000000000..4342c9a7e936 --- /dev/null +++ b/storage/rocksdb/rdb_locking_iter.h @@ -0,0 +1,118 @@ + +/* MySQL header files */ +#include "./rdb_threads.h" /* for thd_get_current_thd */ +#include "sql/debug_sync.h" +#include "sql/handler.h" /* handler */ + +/* MyRocks header files */ +#include "./ha_rocksdb.h" +#include "./rdb_datadic.h" + +namespace myrocks { + +// +// LockingIterator is an iterator that locks the rows before returning, as well +// as scanned gaps between the rows. +// +// Example: +// lock_iter= trx->GetLockingIterator(); +// lock_iter->Seek('abc'); +// lock_iter->Valid()==true && lock_iter->key() == 'bcd'; +// +// After the above, the returned record 'bcd' is locked by transaction trx. +// Also, the range between ['abc'..'bcd'] is empty and locked by trx. 
+//
+// lock_iter->Next();
+// lock_iter->Valid()==true && lock_iter->key() == 'efg'
+//
+// Now, the range ['bcd'.. 'efg'] (bounds inclusive) is also locked, and there
+// are no records between 'bcd' and 'efg'.
+//
+class LockingIterator : public rocksdb::Iterator {
+  rocksdb::Transaction *m_txn;
+  rocksdb::ColumnFamilyHandle *m_cfh;
+  const std::shared_ptr<Rdb_key_def> &m_kd;
+
+  rocksdb::ReadOptions m_read_opts;
+  rocksdb::Iterator *m_iter;
+  rocksdb::Status m_status;
+
+  // note: an iterator that has reached EOF has status()==OK && m_valid==false
+  bool m_valid;
+
+  ulonglong *m_lock_count;
+
+  // If true, m_locked_until has a valid key value.
+  bool m_have_locked_until;
+
+  // The key value up to which we've locked the range. That is, we hold a
+  // range lock on [current_position ... m_locked_until].
+  // This is used to avoid making extra GetRangeLock() calls.
+  std::string m_locked_until;
+
+ public:
+  LockingIterator(rocksdb::Transaction *txn, rocksdb::ColumnFamilyHandle *cfh,
+                  const std::shared_ptr<Rdb_key_def> &kd,
+                  const rocksdb::ReadOptions &opts,
+                  ulonglong *lock_count = nullptr)
+      : m_txn(txn),
+        m_cfh(cfh),
+        m_kd(kd),
+        m_read_opts(opts),
+        m_iter(nullptr),
+        m_status(rocksdb::Status::InvalidArgument()),
+        m_valid(false),
+        m_lock_count(lock_count),
+        m_have_locked_until(false) {}
+
+  ~LockingIterator() { delete m_iter; }
+
+  virtual bool Valid() const override { return m_valid; }
+
+  // Note: MyRocks doesn't ever call these:
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;
+
+  virtual void Seek(const rocksdb::Slice &target) override;
+
+  // Position at the last key in the source that is at or before target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or before target.
+  virtual void SeekForPrev(const rocksdb::Slice &target) override;
+
+  virtual void Next() override;
+  virtual void Prev() override;
+
+  virtual rocksdb::Slice key() const override {
+    assert(Valid());
+    return m_iter->key();
+  }
+
+  virtual rocksdb::Slice value() const override {
+    assert(Valid());
+    return m_iter->value();
+  }
+
+  virtual rocksdb::Status status() const override { return m_status; }
+
+ private:
+  void lock_up_to(bool scan_forward, const rocksdb::Slice &target,
+                  const rocksdb::Slice &end_key);
+  void lock_till_iterator_end(bool scan_forward, const rocksdb::Slice &target);
+  void Scan(bool scan_forward, const rocksdb::Slice &target, bool call_next);
+
+  inline void ScanForward(const rocksdb::Slice &target, bool call_next) {
+    Scan(true, target, call_next);
+  }
+
+  inline void ScanBackward(const rocksdb::Slice &target, bool call_next) {
+    Scan(false, target, call_next);
+  }
+};
+
+rocksdb::Iterator *GetLockingIterator(
+    rocksdb::Transaction *trx, const rocksdb::ReadOptions &read_options,
+    rocksdb::ColumnFamilyHandle *column_family,
+    const std::shared_ptr<Rdb_key_def> &kd, ulonglong *counter);
+
+}  // namespace myrocks
diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc
index a723ac9e8060..071d27698159 100644
--- a/storage/rocksdb/rdb_utils.cc
+++ b/storage/rocksdb/rdb_utils.cc
@@ -259,6 +259,31 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
   return str;
 }
 
+/*
+  Print the range in hex, in "start_endpoint-end_endpoint" form
+*/
+
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString &start,
+                              const rocksdb::EndpointWithString &end) {
+  std::string res;
+  // For keys: :0 keys should look like point keys
+  if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) {
+    // This is a single-point range, show it like a key
+    res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+  } else {
+    res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+    if (start.inf_suffix) res.append(":1");
+
+    res.append("-");
+
+    std::string key2 =
+        rdb_hexdump(end.slice.c_str(), end.slice.length(), FN_REFLEN);
+    if (end.inf_suffix) key2.append(":1");
+    res.append(key2);
+  }
+  return res;
+}
+
 /*
   Attempt to access the database subdirectory to see if it exists
 */
diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h
index f49f102a08c2..608e02272e70 100644
--- a/storage/rocksdb/rdb_utils.h
+++ b/storage/rocksdb/rdb_utils.h
@@ -29,6 +29,7 @@
 /* RocksDB header files */
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
 
 /* MyRocks header files */
 #include "./rdb_global.h"
@@ -290,6 +291,8 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
                         const std::size_t maxsize = 0)
     MY_ATTRIBUTE((__nonnull__));
 
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString &left,
+                              const rocksdb::EndpointWithString &right);
 /*
   Helper function to see if a database exists
 */