diff --git a/CMakeLists.txt b/CMakeLists.txt index 485ff0fe25..69db0069a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -575,7 +575,81 @@ endif() set(PROTOBUF_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) set(PROTOBUF_LIBRARY ${INSTALL_LIBDIR}/${LIB_PROTOBUF}) set(PROTOBUF_PROTOC ${STAGED_INSTALL_PREFIX}/bin/protoc) +set(USE_S3 1) +ExternalProject_Add(prometheus_cpp + URL + https://github.com/jupp0r/prometheus-cpp/releases/download/v1.2.4/prometheus-cpp-with-submodules.tar.gz + CMAKE_ARGS + -DBUILD_SHARED_LIBS=ON + -DENABLE_PUSH=OFF + -DENABLE_COMPRESSION=OFF + -DCMAKE_INSTALL_LIBDIR=${INSTALL_LIBDIR} + -DCMAKE_INSTALL_INCLUDEDIR=${INSTALL_INCLUDEDIR} + BUILD_ALWAYS + 1 + BUILD_COMMAND + make -j${CPU_CORE} +) + +set(PROMETHEUS_CPP_CORE_LIB ${INSTALL_LIBDIR}/libprometheus-cpp-core.so) +set(PROMETHEUS_CPP_PULL_LIB ${INSTALL_LIBDIR}/libprometheus-cpp-pull.so) + +if (USE_S3) +ExternalProject_Add(rocksdb + DEPENDS + gflags + gtest + snappy + zstd + lz4 + zlib + ${LIBGPERF_NAME} + ${LIBJEMALLOC_NAME} + URL + #temporary for debug, skip download from github + http://10.224.129.40:8000/async_upload/rocksdb_cloud.tar.gz + URL_HASH + MD5=31c2188019b0d9ebc11d4df42ce885f2 + DOWNLOAD_NO_PROGRESS + 1 + UPDATE_COMMAND + "" + LOG_CONFIGURE + 1 + LOG_BUILD + 1 + LOG_INSTALL + 1 + BUILD_ALWAYS + 1 + CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX=${STAGED_INSTALL_PREFIX} + -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} + -DCMAKE_BUILD_TYPE=${LIB_BUILD_TYPE} + -DUSE_RTTI=1 + -DWITH_BENCHMARK=OFF + -DWITH_BENCHMARK_TOOLS=OFF + -DWITH_TOOLS=OFF + -DWITH_CORE_TOOLS=OFF + -DWITH_TESTS=OFF + -DWITH_TRACE_TOOLS=OFF + -DWITH_EXAMPLES=OFF + -DROCKSDB_BUILD_SHARED=OFF + -DWITH_JEMALLOC=${JEMALLOC_ON} + -DWITH_LZ4=ON + -DWITH_SNAPPY=ON + -DWITH_ZLIB=ON + -DWITH_ZSTD=ON + -DWITH_GFLAGS=ON + -DFAIL_ON_WARNINGS=OFF + -DWITH_LIBURING=OFF + -DPORTABLE=1 + -DWITH_AWS=ON + BUILD_COMMAND + make -j${CPU_CORE} +) +else() ExternalProject_Add(rocksdb DEPENDS gflags @@ -627,6 +701,7 @@ ExternalProject_Add(rocksdb BUILD_COMMAND make -j${CPU_CORE} ) +endif() ExternalProject_Add(rediscache URL @@ -725,6 +800,10 @@ endif() set(ROCKSDB_INCLUDE_DIR ${INSTALL_INCLUDEDIR}) set(ROCKSDB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${EP_BASE_SUFFIX}/Source/rocksdb) +if(USE_S3) + add_compile_definitions(USE_S3) +endif() + add_subdirectory(src/pstd) add_subdirectory(src/net) add_subdirectory(src/storage) @@ -734,7 +813,7 @@ if (USE_PIKA_TOOLS) endif() aux_source_directory(src DIR_SRCS) -# # generate version +# generate version string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC) set(PIKA_BUILD_DATE "${TS}" CACHE STRING "the time we first built pika") @@ -769,7 +848,9 @@ set(PIKA_BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/pika_build_version.cc message("PIKA_BUILD_VERSION_CC : " ${PIKA_BUILD_VERSION_CC}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/build_version.cc.in ${PIKA_BUILD_VERSION_CC} @ONLY) -set(PROTO_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/pika_inner_message.proto ${CMAKE_CURRENT_SOURCE_DIR}/src/rsync_service.proto) +set(PROTO_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/pika_inner_message.proto + ${CMAKE_CURRENT_SOURCE_DIR}/src/rsync_service.proto + ${CMAKE_CURRENT_SOURCE_DIR}/src/pika_cloud_binlog.proto) custom_protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES}) message("pika PROTO_SRCS = ${PROTO_SRCS}") message("pika PROTO_HDRS = ${PROTO_HDRS}") @@ -784,6 +865,17 @@ target_link_directories(${PROJECT_NAME} PUBLIC ${INSTALL_LIBDIR_64} PUBLIC ${INSTALL_LIBDIR}) +if (USE_S3) + find_package(AWSSDK REQUIRED COMPONENTS s3-crt transfer kinesis) + 
include_directories(${AWS_INCLUDE_DIR})
+  target_link_libraries(${PROJECT_NAME} ${AWSSDK_LINK_LIBRARIES})
+endif()
+
+# SSL
+find_package(OpenSSL REQUIRED)
+include_directories(${OPENSSL_INCLUDE_DIR})
+target_link_libraries(${PROJECT_NAME} OpenSSL::SSL OpenSSL::Crypto)
+
 add_dependencies(${PROJECT_NAME}
   gflags
   gtest
@@ -827,7 +919,8 @@ target_link_libraries(${PROJECT_NAME}
   libz.a
   librediscache.a
   ${LIBUNWIND_LIBRARY}
-  ${JEMALLOC_LIBRARY})
+  ${JEMALLOC_LIBRARY}
+)
 
 option(USE_SSL "Enable SSL support" OFF)
 add_custom_target(
diff --git a/codis/config/dashboard.toml b/codis/config/dashboard.toml
index 44ef06213a..4cb924ed3e 100644
--- a/codis/config/dashboard.toml
+++ b/codis/config/dashboard.toml
@@ -44,4 +44,9 @@ sentinel_down_after = "30s"
 sentinel_failover_timeout = "5m"
 sentinel_notification_script = ""
 sentinel_client_reconfig_script = ""
+sentinel_pika_local_mode = false
+cloud_access_key = "minioadmin"
+cloud_secret_key = "minioadmin"
+cloud_endpoint_override = "http://10.224.129.40:9000"
+cloud_src_bucket_region = "us-east-1"
diff --git a/codis/example/dashboard.py b/codis/example/dashboard.py
index 62568cc83a..845aceab6d 100644
--- a/codis/example/dashboard.py
+++ b/codis/example/dashboard.py
@@ -43,7 +43,12 @@ def _open_config(admin_port, product_name, product_auth=None):
         f.write('sentinel_failover_timeout = "10m"\n')
         path = os.getcwd()
         f.write('sentinel_notification_script = "{}"\n'.format(os.path.join(path, "sentinel_notify.sh")))
-        f.write('sentinel_client_reconfig_script = "{}"\n'.format(os.path.join(path, "sentinel_reconfig.sh")))
+        f.write('sentinel_client_reconfig_script = "{}"\n'.format(os.path.join(path, "sentinel_reconfig.sh")))
+        f.write('sentinel_pika_local_mode = "true"\n')
+        f.write('cloud_access_key = "minioadmin"\n')
+        f.write('cloud_secret_key = "minioadmin"\n')
+        f.write('cloud_endpoint_override = "http://10.224.129.40:9000"\n')
+        f.write('cloud_src_bucket_region = "us-east-1"\n')
     return config
diff --git a/codis/go.mod b/codis/go.mod
index e4af7493af..0dccd51976 100644
--- a/codis/go.mod
+++ b/codis/go.mod
@@ -8,6 +8,7 @@ replace google.golang.org/grpc => google.golang.org/grpc v1.29.0
 require (
 	github.com/BurntSushi/toml v0.3.1
+	github.com/aws/aws-sdk-go v1.30.12
 	github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815
 	github.com/emirpasic/gods v1.18.1
 	github.com/garyburd/redigo v1.6.4
@@ -18,6 +19,7 @@ require (
 	github.com/martini-contrib/render v0.0.0-20150707142108-ec18f8345a11
 	github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414
 	github.com/spinlock/jemalloc-go v0.0.0-20201010032256-e81523fb8524
+	github.com/stretchr/testify v1.8.0
 	go.etcd.io/etcd/client/v2 v2.305.7
 	golang.org/x/net v0.17.0
 	gopkg.in/alexcesaro/statsd.v2 v2.0.0
@@ -26,10 +28,14 @@ require (
 	github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0 // indirect
 	github.com/coreos/go-semver v0.3.1 // indirect
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/jmespath/go-jmespath v0.4.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
 	go.etcd.io/etcd/api/v3 v3.5.7 // indirect
 	go.etcd.io/etcd/client/pkg/v3 v3.5.7 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/codis/go.sum b/codis/go.sum
index f30f9e17be..50dedd373d 100644
--- a/codis/go.sum
+++ b/codis/go.sum
@@ -1,5 +1,7 @@
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/aws/aws-sdk-go v1.30.12 h1:KrjyosZvkpJjcwMk0RNxMZewQ47v7+ZkbQDXjWsJMs8= +github.com/aws/aws-sdk-go v1.30.12/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0 h1:sDMmm+q/3+BukdIpxwO365v/Rbspp2Nt5XntgQRXq8Q= github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= @@ -15,10 +17,16 @@ github.com/garyburd/redigo v1.6.4 h1:LFu2R3+ZOPgSMWMOL+saa/zXRjw0ID2G8FepO53BGlg github.com/garyburd/redigo v1.6.4/go.mod h1:rTb6epsqigu3kYKBnaF028A7Tf/Aw5s0cqA47doKKqw= github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab h1:xveKWz2iaueeTaUgdetzel+U7exyigDYBryyVfV/rZk= github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab/go.mod h1:/P9AEU963A2AYjv4d1V5eVL1CQbEJq6aCNHDDjibzu8= +github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/google/go-cmp v0.5.7 h1:81/ik6ipDQS2aGcBfIN5dHDB36BwrStyeAQquSYCV4o= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/influxdata/influxdb v1.11.0 h1:0X+ZsbcOWc6AEi5MHee9BYqXCKmz8IZsljrRYjmV8Qg= github.com/influxdata/influxdb v1.11.0/go.mod h1:V93tJcidY0Zh0LtSONZWnXXGDyt20dtVf+Ddp4EnhaA= +github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/martini-contrib/binding v0.0.0-20160701174519-05d3e151b6cf h1:6YSkbjZVghliN7zwJC/U3QQG+OVXOrij3qQ8sxfPIMg= @@ -34,6 +42,7 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw= github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg= @@ -41,18 +50,32 @@ github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0 github.com/spinlock/jemalloc-go v0.0.0-20201010032256-e81523fb8524 h1:U+dpuWn15gFCqZkqhpUd5a85X1Oe1Tb+DeGF3nn6Bvs= github.com/spinlock/jemalloc-go v0.0.0-20201010032256-e81523fb8524/go.mod h1:A/ik9Cf2cSgEVcmTWlvTfCxyFgoL1UP/WbevsdDeguc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= go.etcd.io/etcd/api/v3 v3.5.7 h1:sbcmosSVesNrWOJ58ZQFitHMdncusIifYcrBfwrlJSY= go.etcd.io/etcd/api/v3 v3.5.7/go.mod h1:9qew1gCdDDLu+VwmeG+iFpL+QlpHTo7iubavdVDgCAA= go.etcd.io/etcd/client/pkg/v3 v3.5.7 h1:y3kf5Gbp4e4q7egZdn5T7W9TSHUvkClN6u+Rq9mEOmg= go.etcd.io/etcd/client/pkg/v3 v3.5.7/go.mod h1:o0Abi1MK86iad3YrWhgUsbGx1pmTS+hrORWc2CamuhY= go.etcd.io/etcd/client/v2 v2.305.7 h1:AELPkjNR3/igjbO7CjyF1fPuVPjrblliiKj+Y6xSGOU= go.etcd.io/etcd/client/v2 v2.305.7/go.mod h1:GQGT5Z3TBuAQGvgPfhR7VPySu/SudxmEkRq9BgzFU6s= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= gopkg.in/alexcesaro/statsd.v2 v2.0.0 h1:FXkZSCZIH17vLCO5sO2UucTHsH9pc+17F6pl3JVCwMc= gopkg.in/alexcesaro/statsd.v2 v2.0.0/go.mod h1:i0ubccKGzBVNBpdGV5MocxyA/XlLUJzA7SLonnE4drU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/codis/pkg/models/group.go b/codis/pkg/models/group.go index 092ec2f117..88aa8cbc21 100644 --- a/codis/pkg/models/group.go +++ b/codis/pkg/models/group.go @@ -7,6 +7,7 @@ const MaxGroupId = 9999 type Group struct { Id int `json:"id"` + TermId int `json:"term_id"` Servers []*GroupServer `json:"servers"` Promoting struct { diff --git a/codis/pkg/topom/config.go b/codis/pkg/topom/config.go index d1e0d44e5f..6e496ed827 100644 --- a/codis/pkg/topom/config.go +++ b/codis/pkg/topom/config.go @@ -61,6 +61,11 @@ sentinel_down_after = "30s" sentinel_failover_timeout = "5m" sentinel_notification_script = "" sentinel_client_reconfig_script = "" +sentinel_pika_local_mode = false +cloud_access_key = "minioadmin" +cloud_secret_key = "minioadmin" +cloud_endpoint_override = "http://10.224.129.40:9000" +cloud_src_bucket_region = "us-east-1" ` type Config struct { @@ -95,6 +100,11 @@ type Config struct { SentinelFailoverTimeout timesize.Duration `toml:"sentinel_failover_timeout" 
json:"sentinel_failover_timeout"` SentinelNotificationScript string `toml:"sentinel_notification_script" json:"sentinel_notification_script"` SentinelClientReconfigScript string `toml:"sentinel_client_reconfig_script" json:"sentinel_client_reconfig_script"` + SentinelPikaLocalMode bool `toml:"sentinel_pika_local_mode" json:"sentinel_pika_local_mode"` + CloudAccessKey string `toml:"cloud_access_key" json:"cloud_access_key"` + CloudSecretKey string `toml:"cloud_secret_key" json:"cloud_secret_key"` + CloudEndPointOverride string `toml:"cloud_endpoint_override" json:"cloud_endpoint_override"` + CloudSrcBucketRegion string `toml:"cloud_src_bucket_region" json:"cloud_src_bucket_region"` } func NewDefaultConfig() *Config { diff --git a/codis/pkg/topom/topom.go b/codis/pkg/topom/topom.go index f2c34f6b58..ca7186494e 100644 --- a/codis/pkg/topom/topom.go +++ b/codis/pkg/topom/topom.go @@ -197,7 +197,7 @@ func (s *Topom) Start(routines bool) error { return nil } - // Check the status of all masters and slaves every 5 seconds + // Check the status of all masters and slaves every 10 seconds gxruntime.GoUnterminated(func() { for !s.IsClosed() { if s.IsOnline() { diff --git a/codis/pkg/topom/topom_api.go b/codis/pkg/topom/topom_api.go index 6b8d9cc0e3..585fe90e2f 100644 --- a/codis/pkg/topom/topom_api.go +++ b/codis/pkg/topom/topom_api.go @@ -4,18 +4,20 @@ package topom import ( + "encoding/base64" + "encoding/json" "fmt" + "io" "net/http" "strconv" "strings" "time" - _ "net/http/pprof" - "github.com/go-martini/martini" "github.com/martini-contrib/binding" "github.com/martini-contrib/gzip" "github.com/martini-contrib/render" + _ "net/http/pprof" "pika/codis/v2/pkg/models" "pika/codis/v2/pkg/utils/errors" @@ -74,6 +76,7 @@ func newApiServer(t *Topom) http.Handler { r.Get("/xping/:xauth", api.XPing) r.Get("/stats/:xauth", api.Stats) r.Get("/slots/:xauth", api.Slots) + r.Post("/upload-s3", api.UploadManifestToS3) r.Put("/reload/:xauth", api.Reload) r.Put("/shutdown/:xauth", api.Shutdown) r.Put("/loglevel/:xauth/:value", api.LogLevel) @@ -500,6 +503,38 @@ func (s *apiServer) SyncRemoveAction(params martini.Params) (int, string) { } } +type UploadRequest struct { + GroupId int `json:"group_id"` + TermId int `json:"term_id"` + S3Bucket string `json:"s3_bucket"` + S3Path string `json:"s3_path"` + Content string `json:"content"` +} + +func (s *apiServer) UploadManifestToS3(req *http.Request) (int, string) { + body, err := io.ReadAll(req.Body) + if err != nil { + return rpc.ApiResponseError(err) + } + + var uploadReq UploadRequest + err = json.Unmarshal(body, &uploadReq) + if err != nil { + return rpc.ApiResponseError(err) + } + + content, err := base64.StdEncoding.DecodeString(uploadReq.Content) + if err != nil { + return rpc.ApiResponseError(err) + } + if err := s.topom.UploadManifestToS3(uploadReq.GroupId, uploadReq.TermId, uploadReq.S3Bucket, + uploadReq.S3Path, content); err != nil { + return rpc.ApiResponseError(err) + } else { + return rpc.ApiResponseJson("OK") + } +} + func (s *apiServer) SlotCreateAction(params martini.Params) (int, string) { if err := s.verifyXAuth(params); err != nil { return rpc.ApiResponseError(err) diff --git a/codis/pkg/topom/topom_group.go b/codis/pkg/topom/topom_group.go index 517fb2da4c..9836af1e3c 100644 --- a/codis/pkg/topom/topom_group.go +++ b/codis/pkg/topom/topom_group.go @@ -4,9 +4,17 @@ package topom import ( + "bytes" + "encoding/binary" "encoding/json" + "os" "time" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + 
"github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3/s3manager" + "pika/codis/v2/pkg/models" "pika/codis/v2/pkg/utils/errors" "pika/codis/v2/pkg/utils/log" @@ -31,6 +39,7 @@ func (s *Topom) CreateGroup(gid int) error { g := &models.Group{ Id: gid, + TermId: 0, Servers: []*models.GroupServer{}, } return s.storeCreateGroup(g) @@ -319,6 +328,7 @@ func (s *Topom) GroupPromoteServer(gid int, addr string) error { g = &models.Group{ Id: g.Id, + TermId: g.TermId, Servers: g.Servers, } return s.storeUpdateGroup(g) @@ -517,6 +527,7 @@ func (s *Topom) doSwitchGroupMaster(g *models.Group, newMasterAddr string, newMa g.Servers[newMasterIndex].Role = models.RoleMaster g.Servers[newMasterIndex].Action.State = models.ActionSynced g.Servers[0], g.Servers[newMasterIndex] = g.Servers[newMasterIndex], g.Servers[0] + g.TermId++ defer func() { err = s.storeUpdateGroup(g) // clean cache whether err is nil or not @@ -784,3 +795,61 @@ func (s *Topom) newSyncActionExecutor(addr string) (func() error, error) { } }, nil } + +func (s *Topom) UploadManifestToS3(gid int, tid int, bucket string, filename string, content []byte) error { + ctx, err := s.newContext() + if err != nil { + return err + } + if gid <= 0 || gid > models.MaxGroupId { + return errors.Errorf("invalid group id = %d, out of range", gid) + } + + group, exists := ctx.group[gid] + + if !exists { + return errors.Errorf("group-[%d] not exists", gid) + } + + if group.TermId != tid { + return errors.Errorf("group-[%d] term id:[%d] not equal to pika term id:[%d]", + gid, ctx.group[gid].TermId, tid) + } + + sess, err := session.NewSession(&aws.Config{ + Credentials: credentials.NewStaticCredentials(s.Config().CloudAccessKey, + s.Config().CloudSecretKey, ""), + Endpoint: aws.String(s.Config().CloudEndPointOverride), + Region: aws.String(s.Config().CloudSrcBucketRegion), + DisableSSL: aws.Bool(true), + S3ForcePathStyle: aws.Bool(true), + DisableEndpointHostPrefix: aws.Bool(true), + }) + + file, err := os.Create("./upload-manifest") + if err != nil { + return errors.Errorf("Create manifest file err :[%s]", err) + } + defer file.Close() + buf := new(bytes.Buffer) + err = binary.Write(buf, binary.LittleEndian, content) + if err != nil { + return errors.Errorf("Write binary manifest err :[%s]", err) + } + _, err = file.Write(buf.Bytes()) + if err != nil { + return errors.Errorf("Write manifest file err :[%s]", err) + } + + uploader := s3manager.NewUploader(sess) + _, err = uploader.Upload(&s3manager.UploadInput{ + Bucket: aws.String(bucket), + Key: aws.String(filename), + Body: file, + }) + if err != nil { + return errors.Errorf("Unable to upload [%s] to [%s], [%s]", filename, bucket, err) + } + + return nil +} diff --git a/codis/pkg/topom/topom_sentinel.go b/codis/pkg/topom/topom_sentinel.go index 3ea8b3cd9f..a06f073e85 100644 --- a/codis/pkg/topom/topom_sentinel.go +++ b/codis/pkg/topom/topom_sentinel.go @@ -16,12 +16,25 @@ func (s *Topom) CheckStateAndSwitchSlavesAndMasters(filter func(index int, g *mo return err } + var states []*redis.ReplicationState groupServers := filterGroupServer(ctx.getGroupServers(), filter) if len(groupServers) == 0 { return nil } + if s.Config().SentinelPikaLocalMode { + states = checkGroupServersReplicationState(s.Config(), groupServers) + } else { + groups_info := make(map[int]int) + for gid, _ := range groupServers { + group, err := ctx.getGroup(gid) + if err != nil { + continue + } + groups_info[gid] = group.TermId + } + states = checkGroupServersPKPingState(s.Config(), groupServers, groups_info) + } - 
states := checkGroupServersReplicationState(s.Config(), groupServers) var slaveOfflineGroups []*models.Group var masterOfflineGroups []*models.Group var recoveredGroupServersState []*redis.ReplicationState @@ -111,6 +124,20 @@ func checkGroupServersReplicationState(conf *Config, gs map[int][]*models.GroupS return sentinel.RefreshMastersAndSlavesClient(config.ParallelSyncs, gs) } +func checkGroupServersPKPingState(conf *Config, gs map[int][]*models.GroupServer, groups_info map[int]int) []*redis.ReplicationState { + config := &redis.MonitorConfig{ + Quorum: conf.SentinelQuorum, + ParallelSyncs: conf.SentinelParallelSyncs, + DownAfter: conf.SentinelDownAfter.Duration(), + FailoverTimeout: conf.SentinelFailoverTimeout.Duration(), + NotificationScript: conf.SentinelNotificationScript, + ClientReconfigScript: conf.SentinelClientReconfigScript, + } + + sentinel := redis.NewCodisSentinel(conf.ProductName, conf.ProductAuth) + return sentinel.RefreshMastersAndSlavesClientWithPKPing(config.ParallelSyncs, gs, groups_info) +} + func filterGroupServer(groupServers map[int][]*models.GroupServer, filter func(index int, gs *models.GroupServer) bool) map[int][]*models.GroupServer { filteredGroupServers := make(map[int][]*models.GroupServer) diff --git a/codis/pkg/utils/redis/client.go b/codis/pkg/utils/redis/client.go index 5f751321db..3063b781e9 100644 --- a/codis/pkg/utils/redis/client.go +++ b/codis/pkg/utils/redis/client.go @@ -436,6 +436,16 @@ func (c *Client) Role() (string, error) { } } +func (c *Client) PKPing(group_info []byte) (*InfoReplication, error) { + //waiting to complete + text, err := redigo.String(c.Do("PKPing", group_info)) + if err != nil { + return nil, errors.Trace(err) + } + + return parseInfoReplication(text) +} + var ErrClosedPool = errors.New("use of closed redis pool") type Pool struct { diff --git a/codis/pkg/utils/redis/codis_sentinel.go b/codis/pkg/utils/redis/codis_sentinel.go index 4d1ce73bed..f8f9b89da1 100644 --- a/codis/pkg/utils/redis/codis_sentinel.go +++ b/codis/pkg/utils/redis/codis_sentinel.go @@ -2,6 +2,7 @@ package redis import ( "context" + "encoding/json" "fmt" "time" @@ -99,12 +100,92 @@ func (s *CodisSentinel) RefreshMastersAndSlavesClient(parallel int, groupServers fut.Add() go func(gid, index int, server *models.GroupServer) { + var state *ReplicationState defer func() { + fut.Done(fmt.Sprintf("%d_%d", gid, index), state) <-limit }() - info, err := s.infoReplicationDispatch(server.Addr) - state := &ReplicationState{ + state = &ReplicationState{ + Index: index, + GroupID: gid, + Addr: server.Addr, + Server: server, + Replication: info, + Err: err, + } + }(gid, index, server) + } + } + + results := make([]*ReplicationState, 0) + + for _, v := range fut.Wait() { + switch val := v.(type) { + case *ReplicationState: + if val != nil { + results = append(results, val) + } + } + } + + return results +} + +type GroupInfo struct { + GroupId int `json:"group_id"` + TermId int `json:"term_id"` + MastersAddr []string `json:"masters_addr"` + SlavesAddr []string `json:"slaves_addr"` +} + +func (s *CodisSentinel) RefreshMastersAndSlavesClientWithPKPing(parallel int, groupServers map[int][]*models.GroupServer, groups_info map[int]int) []*ReplicationState { + if len(groupServers) == 0 { + s.printf("there's no groups") + return nil + } + + parallel = math2.MaxInt(10, parallel) + limit := make(chan struct{}, parallel) + defer close(limit) + + var fut sync2.Future + + //build pkping parameter + groups_parameter := make(map[int]GroupInfo) + for gid, servers := range groupServers { 
+ group_info := groups_parameter[gid] + group_info.GroupId = gid + group_info.TermId = groups_info[gid] + for _, server := range servers { + if server.Role == models.RoleMaster { + group_info.MastersAddr = append(group_info.MastersAddr, server.Addr) + } + + if server.Role == models.RoleSlave { + group_info.SlavesAddr = append(group_info.SlavesAddr, server.Addr) + } + } + groups_parameter[gid] = group_info + } + + for gid, servers := range groupServers { + group_info_json, err := json.Marshal(groups_parameter[gid]) + if err != nil { + log.WarnErrorf(err, "json: %s Serialization Failure failed", group_info_json) + } + for index, server := range servers { + limit <- struct{}{} + fut.Add() + + go func(gid, index int, server *models.GroupServer) { + var state *ReplicationState + defer func() { + fut.Done(fmt.Sprintf("%d_%d", gid, index), state) + <-limit + }() + info, err := s.PkPingDispatch(server.Addr, group_info_json) + state = &ReplicationState{ Index: index, GroupID: gid, Addr: server.Addr, @@ -112,7 +193,6 @@ func (s *CodisSentinel) RefreshMastersAndSlavesClient(parallel int, groupServers Replication: info, Err: err, } - fut.Done(fmt.Sprintf("%d_%d", gid, index), state) }(gid, index, server) } } @@ -143,3 +223,16 @@ func (s *CodisSentinel) infoReplicationDispatch(addr string) (*InfoReplication, defer client.Close() return client.InfoReplication() } + +func (s *CodisSentinel) PkPingDispatch(addr string, group_info []byte) (*InfoReplication, error) { + var ( + client *Client + err error + ) + if client, err = NewClient(addr, s.Auth, time.Second); err != nil { + log.WarnErrorf(err, "create redis client to %s failed", addr) + return nil, err + } + defer client.Close() + return client.PKPing(group_info) +} diff --git a/conf/pika.conf b/conf/pika.conf index 2f6990b959..e8792ee220 100644 --- a/conf/pika.conf +++ b/conf/pika.conf @@ -512,3 +512,31 @@ cache-lfu-decay-time: 1 # # aclfile : ../conf/users.acl +###################################################################### +# rocksdb-cloud options +####################################################################### + +# Normally, the AWS SDK will automatically determine the endpoint based on the selected region. +# However, in special cases, you can manually specify the URL of the endpoint through this configuration, +# such as local development. +# Default: "" +cloud-endpoint-override : 10.224.129.40:9000 + +# The aws access key id and aws secret key used for authentication when accessing aws s3. +cloud-access-key : minioadmin +cloud-secret-key : minioadmin + +# The source bucket name prefix and suffix to use for storage on s3 +# The final bucket name is [prefix][suffix] +# Default: "pika." 
+# cloud-src-bucket-prefix : +# Default: "database" +# cloud-src-bucket-suffix : + +# The source bucket region +# cloud-src-bucket-region : + +# Configuration information of the destination bucket +# cloud-dest-bucket-prefix : +# cloud-dest-bucket-suffix : +# cloud-dest-bucket-region : diff --git a/include/pika_admin.h b/include/pika_admin.h index 7693f0329d..daf138998d 100644 --- a/include/pika_admin.h +++ b/include/pika_admin.h @@ -260,6 +260,8 @@ class InfoCmd : public Cmd { kInfoCache }; + friend class PKPingCmd; + InfoCmd(const std::string& name, int arity, uint32_t flag) : Cmd(name, arity, flag) {} void Do() override; void Split(const HintKeys& hint_keys) override {}; @@ -595,6 +597,24 @@ class ClearCacheCmd : public Cmd { void DoInitial() override; }; +class PKPingCmd : public Cmd { + public: + PKPingCmd(const std::string& name, int arity, uint32_t flag) : Cmd(name, arity, flag) {} + void Do() override; + void Split(const HintKeys& hint_keys) override {}; + void Merge() override {}; + Cmd* Clone() override { return new PKPingCmd(*this); } + + private: + uint32_t group_id_ = 0; + uint32_t term_id_ = 0; + std::vector masters_addr_; + std::vector slaves_addr_; + + void DoInitial() override; + void Clear() override {} +}; + #ifdef WITH_COMMAND_DOCS class CommandCmd : public Cmd { public: diff --git a/include/pika_binlog.h b/include/pika_binlog.h index 84127fb535..980a668bbb 100644 --- a/include/pika_binlog.h +++ b/include/pika_binlog.h @@ -12,6 +12,7 @@ #include "pstd/include/pstd_mutex.h" #include "pstd/include/pstd_status.h" #include "pstd/include/noncopyable.h" +#include "pstd/include/pstd_wal.h" #include "include/pika_define.h" std::string NewFileName(const std::string& name, uint32_t current); @@ -43,39 +44,43 @@ class Version final : public pstd::noncopyable { std::shared_ptr save_; }; -class Binlog : public pstd::noncopyable { +class Binlog : public pstd::WalWriter { public: Binlog(std::string Binlog_path, int file_size = 100 * 1024 * 1024); - ~Binlog(); + virtual ~Binlog(); void Lock() { mutex_.lock(); } void Unlock() { mutex_.unlock(); } - pstd::Status Put(const std::string& item); + virtual pstd::Status Put(const std::string& item); - pstd::Status GetProducerStatus(uint32_t* filenum, uint64_t* pro_offset, uint32_t* term = nullptr, uint64_t* logic_id = nullptr); + virtual pstd::Status Put(const std::string& item, uint32_t db_id, uint32_t rocksdb_id, uint32_t type) override; + + virtual pstd::Status GetProducerStatus(uint32_t* filenum, uint64_t* pro_offset, uint32_t* term = nullptr, uint64_t* logic_id = nullptr); + + virtual pstd::Status GetOldestBinlogToKeep(uint32_t* filenum, uint32_t* term = nullptr, uint64_t* logic_id = nullptr); /* * Set Producer pro_num and pro_offset with lock */ - pstd::Status SetProducerStatus(uint32_t pro_num, uint64_t pro_offset, uint32_t term = 0, uint64_t index = 0); + virtual pstd::Status SetProducerStatus(uint32_t pro_num, uint64_t pro_offset, uint32_t term = 0, uint64_t index = 0); // Need to hold Lock(); - pstd::Status Truncate(uint32_t pro_num, uint64_t pro_offset, uint64_t index); + virtual pstd::Status Truncate(uint32_t pro_num, uint64_t pro_offset, uint64_t index); - std::string filename() { return filename_; } + virtual std::string filename() { return filename_; } // need to hold mutex_ - void SetTerm(uint32_t term) { + virtual void SetTerm(uint32_t term) { std::lock_guard l(version_->rwlock_); version_->term_ = term; version_->StableSave(); } - uint32_t term() { + virtual uint32_t term() { std::shared_lock l(version_->rwlock_); return 
version_->term_; } - void Close(); + virtual void Close(); private: pstd::Status Put(const char* item, int len); diff --git a/include/pika_binlog_reader.h b/include/pika_binlog_reader.h index 1d604b02f7..c23591f962 100644 --- a/include/pika_binlog_reader.h +++ b/include/pika_binlog_reader.h @@ -27,6 +27,12 @@ class PikaBinlogReader { bool ReadToTheEnd(); void GetReaderStatus(uint32_t* cur_filenum, uint64_t* cur_offset); + static void GetFirstOffset(const std::shared_ptr& logger, uint32_t filenum, uint64_t* offset) { + PikaBinlogReader reader; + reader.Seek(logger, filenum, 0); + reader.GetReaderStatus(&filenum, offset); + } + private: bool GetNext(uint64_t* size); unsigned int ReadPhysicalRecord(pstd::Slice* result, uint32_t* filenum, uint64_t* offset); diff --git a/include/pika_binlog_transverter.h b/include/pika_binlog_transverter.h index d85d958667..a2f9b9364d 100644 --- a/include/pika_binlog_transverter.h +++ b/include/pika_binlog_transverter.h @@ -7,6 +7,7 @@ #define PIKA_BINLOG_TRANSVERTER_H_ #include + #include #include #include diff --git a/include/pika_cloud_binlog.h b/include/pika_cloud_binlog.h new file mode 100644 index 0000000000..869587ddbc --- /dev/null +++ b/include/pika_cloud_binlog.h @@ -0,0 +1,118 @@ +// Copyright (c) 2015-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef PIKA_CLOUD_BINLOG_H_ +#define PIKA_CLOUD_BINLOG_H_ + +#include + +#include "pstd/include/env.h" +#include "pstd/include/pstd_mutex.h" +#include "pstd/include/pstd_status.h" +#include "pstd/include/noncopyable.h" +#include "include/pika_define.h" +#include "include/pika_binlog.h" + +std::string NewFileName(const std::string& name, uint32_t current); + +class CloudVersion final : public pstd::noncopyable { + public: + CloudVersion(const std::shared_ptr& save); + ~CloudVersion(); + + pstd::Status Init(); + + // RWLock should be held when access members. 
+ pstd::Status StableSave(); + + uint32_t pro_num_ = 0; + uint64_t pro_offset_ = 0; + uint32_t term_ = 0; + uint32_t keep_filenum_ = 0; + uint64_t keep_offset_ = 0; + + std::shared_mutex rwlock_; + + void debug() { + std::shared_lock l(rwlock_); + printf("Current pro_num %u pro_offset %llu\n", pro_num_, pro_offset_); + } + + private: + // shared with versionfile_ + std::shared_ptr save_; +}; + +class CloudBinlog : public Binlog { + public: + CloudBinlog(std::string Binlog_path, int file_size = 100 * 1024 * 1024); + ~CloudBinlog(); + + pstd::Status Put(const std::string& item) override; + + pstd::Status Put(const std::string& item, uint32_t db_id, uint32_t rocksdb_id, uint32_t type) override; + + pstd::Status GetProducerStatus(uint32_t* filenum, uint64_t* pro_offset, uint32_t* term = nullptr, uint64_t* logic_id = nullptr) override; + + pstd::Status GetOldestBinlogToKeep(uint32_t* filenum, uint32_t* term = nullptr, uint64_t* logic_id = nullptr) override; + /* + * Set Producer pro_num and pro_offset with lock + */ + pstd::Status SetProducerStatus(uint32_t pro_num, uint64_t pro_offset, uint32_t term = 0, uint64_t index = 0) override; + // Need to hold Lock(); + pstd::Status Truncate(uint32_t pro_num, uint64_t pro_offset, uint64_t index = 0) override; + + std::string filename() override { return filename_; } + + // need to hold mutex_ + void SetTerm(uint32_t term) override { + std::lock_guard l(version_->rwlock_); + version_->term_ = term; + version_->StableSave(); + } + + uint32_t term() override { + std::shared_lock l(version_->rwlock_); + return version_->term_; + } + + void Close() override; + + private: + pstd::Status Put(const char* item, int len); + pstd::Status EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, int* temp_pro_offset); + static pstd::Status AppendPadding(pstd::WritableFile* file, uint64_t* len); + void InitLogFile(); + + /* + * Produce + */ + pstd::Status Produce(const pstd::Slice& item, int* pro_offset); + + std::atomic opened_; + + std::unique_ptr version_; + std::unique_ptr queue_; + // versionfile_ can only be used as a shared_ptr, and it will be used as a variable version_ in the ~Version() function. + std::shared_ptr versionfile_; + + pstd::Mutex mutex_; + + uint32_t pro_num_ = 0; + + int block_offset_ = 0; + + const std::string binlog_path_; + + uint64_t file_size_ = 0; + + std::string filename_; + + std::atomic binlog_io_error_; + + std::unordered_map binlog_to_keep_; +}; + +#endif diff --git a/include/pika_cloud_binlog_transverter.h b/include/pika_cloud_binlog_transverter.h new file mode 100644 index 0000000000..8d94a3c73d --- /dev/null +++ b/include/pika_cloud_binlog_transverter.h @@ -0,0 +1,31 @@ +// Copyright (c) 2018-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef PIKA_CLOUD_BINLOG_TRANSVERTER_H_ +#define PIKA_CLOUD_BINLOG_TRANSVERTER_H_ + +#include + +#include +#include +#include + +#include "pika_cloud_binlog.pb.h" + +class PikaCloudBinlogTransverter { + public: + PikaCloudBinlogTransverter() = default; + static std::string BinlogEncode(uint32_t db_id, uint32_t rocksdb_id, uint32_t exec_time, uint32_t term_id, + uint32_t filenum, uint64_t offset, const std::string& content, uint32_t type); + + static bool BinlogDecode(const std::string& binlog, cloud::BinlogCloudItem* binlog_item); + + static std::string ConstructPaddingBinlog(uint32_t size); + + static bool BinlogItemWithoutContentDecode(const std::string& binlog, + cloud::BinlogCloudItem* binlog_item); +}; + +#endif diff --git a/include/pika_command.h b/include/pika_command.h index 3d5d535971..ce6a4474d3 100644 --- a/include/pika_command.h +++ b/include/pika_command.h @@ -248,6 +248,8 @@ const std::string kCmdNameXInfo = "xinfo"; const std::string kClusterPrefix = "pkcluster"; +const std::string kCmdPkPing = "pkping"; + using PikaCmdArgsType = net::RedisCmdArgsType; static const int RAW_ARGS_LEN = 1024 * 1024; diff --git a/include/pika_conf.h b/include/pika_conf.h index 6b0917ee76..ae62366094 100644 --- a/include/pika_conf.h +++ b/include/pika_conf.h @@ -314,6 +314,7 @@ class PikaConf : public pstd::BaseConf { return network_interface_; } int cache_model() { return cache_model_; } + int pika_mode() { return pika_mode_; } int sync_window_size() { return sync_window_size_.load(); } int max_conn_rbuf_size() { return max_conn_rbuf_size_.load(); } int consensus_level() { return consensus_level_.load(); } @@ -350,9 +351,13 @@ class PikaConf : public pstd::BaseConf { void SetCacheMaxmemoryPolicy(const int value) { cache_maxmemory_policy_ = value; } void SetCacheMaxmemorySamples(const int value) { cache_maxmemory_samples_ = value; } void SetCacheLFUDecayTime(const int value) { cache_lfu_decay_time_ = value; } + void SetPikaMode(const int value) { pika_mode_ = value; } void UnsetCacheDisableFlag() { tmp_cache_disable_flag_ = false; } bool enable_blob_files() { return enable_blob_files_; } int64_t min_blob_size() { return min_blob_size_; } +#ifdef USE_S3 + int64_t SSTCacheSize() const { return sst_cache_size_; } +#endif int64_t blob_file_size() { return blob_file_size_; } std::string blob_compression_type() { return blob_compression_type_; } bool enable_blob_garbage_collection() { return enable_blob_garbage_collection_; } @@ -396,6 +401,17 @@ class PikaConf : public pstd::BaseConf { uint32_t acl_pubsub_default() { return acl_pubsub_default_.load(); } uint32_t acl_log_max_len() { return acl_Log_max_len_.load(); } + // rocksdb-cloud options + std::string cloud_endpoint_override() { return cloud_endpoint_override_; } + std::string cloud_access_key() { return cloud_access_key_; } + std::string cloud_secret_key() { return cloud_secret_key_; } + std::string cloud_src_bucket_prefix() { return cloud_src_bucket_prefix_; } + std::string cloud_src_bucket_suffix() { return cloud_src_bucket_suffix_; } + std::string cloud_src_bucket_region() { return cloud_src_bucket_region_; } + std::string cloud_dest_bucket_prefix() { return cloud_dest_bucket_prefix_; } + std::string cloud_dest_bucket_suffix() { return cloud_dest_bucket_suffix_; } + std::string cloud_dest_bucket_region() { return cloud_dest_bucket_region_; } + // Setter void SetPort(const int value) { std::lock_guard l(rwlock_); @@ -798,6 +814,9 @@ class PikaConf : public pstd::BaseConf { std::atomic_int cache_maxmemory_samples_; std::atomic_int 
cache_lfu_decay_time_; + //pika mode + int32_t pika_mode_; + // rocksdb blob bool enable_blob_files_ = false; bool enable_blob_garbage_collection_ = false; @@ -809,6 +828,23 @@ class PikaConf : public pstd::BaseConf { int64_t blob_file_size_ = 256 * 1024 * 1024; // 256M std::string blob_compression_type_ = "none"; +#ifdef USE_S3 + int64_t sst_cache_size_ = 10LL << 30; +#endif + + // rocksdb-cloud options + std::string cloud_endpoint_override_; + std::string cloud_access_key_; + std::string cloud_secret_key_; + // rocksdb-cloud src bucket + std::string cloud_src_bucket_prefix_ = "pika."; + std::string cloud_src_bucket_suffix_ = "database"; + std::string cloud_src_bucket_region_; + // rocksdb-cloud dest bucket + std::string cloud_dest_bucket_prefix_ = "pika."; + std::string cloud_dest_bucket_suffix_ = "database"; + std::string cloud_dest_bucket_region_; + std::shared_mutex rwlock_; // Rsync Rate limiting configuration diff --git a/include/pika_db.h b/include/pika_db.h index 8280b6bf38..1e33d41e56 100644 --- a/include/pika_db.h +++ b/include/pika_db.h @@ -94,6 +94,7 @@ class DB : public std::enable_shared_from_this, public pstd::noncopyable { std::shared_ptr storage() const; void GetBgSaveMetaData(std::vector* fileNames, std::string* snapshot_uuid); void BgSaveDB(); + void CloudBgSaveDB(); void SetBinlogIoError(); void SetBinlogIoErrorrelieve(); bool IsBinlogIoError(); @@ -154,6 +155,13 @@ class DB : public std::enable_shared_from_this, public pstd::noncopyable { bool IsBgSaving(); BgSaveInfo bgsave_info(); pstd::Status GetKeyNum(std::vector* key_info); + /* + * Switch Master/Slave role use + */ + rocksdb::Status SwitchMaster(bool is_old_master, bool is_new_master); + + rocksdb::Status ApplyWAL(int rocksdb_id, + int type, const std::string& content); private: bool opened_ = false; @@ -187,12 +195,15 @@ class DB : public std::enable_shared_from_this, public pstd::noncopyable { * BgSave use */ static void DoBgSave(void* arg); + static void DoCloudBgSave(void* arg); bool RunBgsaveEngine(); + void RunCloudBgsaveEngine(rocksdb::CloudFileSystemOptions& cloud_fs_options); bool InitBgsaveEnv(); bool InitBgsaveEngine(); void ClearBgsave(); void FinishBgsave(); + void FinishCloudBgsave(); BgSaveInfo bgsave_info_; pstd::Mutex bgsave_protector_; std::shared_ptr bgsave_engine_; @@ -200,6 +211,7 @@ class DB : public std::enable_shared_from_this, public pstd::noncopyable { struct BgTaskArg { std::shared_ptr db; + rocksdb::CloudFileSystemOptions cloud_fs_options; }; #endif diff --git a/include/pika_define.h b/include/pika_define.h index 176b371111..73c10e6031 100644 --- a/include/pika_define.h +++ b/include/pika_define.h @@ -46,11 +46,28 @@ struct DBStruct { DBStruct(std::string tn, int32_t inst_num) : db_name(std::move(tn)), db_instance_num(inst_num) {} + DBStruct(std::string tn, int32_t inst_num, std::string cloud_endpoint_override, std::string cloud_bucket_prefix, + std::string cloud_bucket_suffix, std::string cloud_bucket_region) + : db_name(std::move(tn)), + db_instance_num(inst_num), + cloud_endpoint_override(std::move(cloud_endpoint_override)), + cloud_bucket_prefix(std::move(cloud_bucket_prefix)), + cloud_bucket_suffix(std::move(cloud_bucket_suffix)), + cloud_bucket_region(std::move(cloud_bucket_region)) {} + bool operator==(const DBStruct& db_struct) const { - return db_name == db_struct.db_name && db_instance_num == db_struct.db_instance_num; + return db_name == db_struct.db_name && db_instance_num == db_struct.db_instance_num && + cloud_endpoint_override == db_struct.cloud_endpoint_override && + 
cloud_bucket_prefix == db_struct.cloud_bucket_prefix && + cloud_bucket_suffix == db_struct.cloud_bucket_suffix && cloud_bucket_region == db_struct.cloud_bucket_region; } + std::string db_name; int32_t db_instance_num = 0; + std::string cloud_endpoint_override; + std::string cloud_bucket_prefix; + std::string cloud_bucket_suffix; + std::string cloud_bucket_region; }; struct SlaveItem { @@ -309,6 +326,17 @@ const int PIKA_ROLE_MASTER = 2; */ constexpr int PIKA_CACHE_NONE = 0; constexpr int PIKA_CACHE_READ = 1; +/* + * cloud model + */ +constexpr int PIKA_LOCAL = 0; +constexpr int PIKA_CLOUD = 1; + +/* + * cloud tmp conf + * todo: TBD based on deployment status + */ +const std::string kRegion = "us-west-2"; /* * cache size diff --git a/include/pika_server.h b/include/pika_server.h index 34145fc171..27830b2426 100644 --- a/include/pika_server.h +++ b/include/pika_server.h @@ -16,6 +16,10 @@ #include #include +#ifdef USE_S3 +#include +#endif + #include "src/cache/include/config.h" #include "net/include/bg_thread.h" #include "net/include/net_pubsub.h" @@ -61,6 +65,7 @@ enum TaskType { kStartKeyScan, kStopKeyScan, kBgSave, + kCloudBgSave, kCompactRangeStrings, kCompactRangeHashes, kCompactRangeSets, @@ -307,6 +312,12 @@ class PikaServer : public pstd::noncopyable { std::shared_mutex bgsave_protector_; BgSaveInfo bgsave_info_; +#ifdef USE_S3 + bool UploadMetaToSentinel(const std::string& s3_bucket, const std::string& remote_path, + const std::string& content); +#endif + + /* * BGSlotsReload used */ @@ -454,6 +465,7 @@ class PikaServer : public pstd::noncopyable { friend class InfoCmd; friend class PikaReplClientConn; friend class PkClusterInfoCmd; + friend class SlaveofCmd; struct BGCacheTaskArg { BGCacheTaskArg() : conf(nullptr), reenable_cache(false) {} @@ -491,6 +503,13 @@ class PikaServer : public pstd::noncopyable { */ int64_t GetLastSave() const {return lastsave_;} void UpdateLastSave(int64_t lastsave) {lastsave_ = lastsave;} + + /*term_id used*/ +#ifdef USE_S3 + void set_lease_term_id(const int lease_term_id) {lease_term_id_ = lease_term_id;} + void set_group_id(const int group_id) {group_id_ = group_id;} +#endif + private: /* * TimingTask use @@ -507,6 +526,14 @@ class PikaServer : public pstd::noncopyable { int port_ = 0; time_t start_time_s_ = 0; +#ifdef USE_S3 + std::string sentinel_addr_; + //TODO(wangshaoyi): make it thread loacal + std::shared_ptr sentinel_client_; + int lease_term_id_; + int group_id_; +#endif + std::shared_mutex storage_options_rw_; storage::StorageOptions storage_options_; void InitStorageOptions(); diff --git a/src/pika_admin.cc b/src/pika_admin.cc index d25b9459e4..b0f4c92aa6 100644 --- a/src/pika_admin.cc +++ b/src/pika_admin.cc @@ -12,17 +12,25 @@ #include #include +#include +#include +#include +#include +#include +#include +#include #include #include "include/build_version.h" #include "include/pika_cmd_table_manager.h" +#include "include/pika_conf.h" #include "include/pika_rm.h" #include "include/pika_server.h" #include "include/pika_version.h" -#include "include/pika_conf.h" #include "pstd/include/rsync.h" using pstd::Status; +using namespace Aws::Utils; extern PikaServer* g_pika_server; extern std::unique_ptr g_pika_rm; @@ -147,9 +155,18 @@ void SlaveofCmd::Do() { return; } + bool is_old_master = !(g_pika_server->role() == PIKA_ROLE_SLAVE); + LOG(WARNING) << "slaveofcmd, currently role: " << g_pika_server->role(); + g_pika_server->RemoveMaster(); if (is_none_) { + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + std::shared_lock rwl(g_pika_server->dbs_rw_); 
+ for (const auto& db_item : g_pika_server->dbs_) { + db_item.second->SwitchMaster(is_old_master, true); + } + } res_.SetRes(CmdRes::kOk); g_pika_conf->SetSlaveof(std::string()); return; @@ -159,8 +176,14 @@ void SlaveofCmd::Do() { * the data synchronization was successful, but only changes the status of the * slaveof executor to slave */ - bool sm_ret = g_pika_server->SetMaster(master_ip_, static_cast(master_port_)); + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + std::shared_lock rwl(g_pika_server->dbs_rw_); + for (const auto& db_item : g_pika_server->dbs_) { + db_item.second->SwitchMaster(is_old_master, false); + } + } + bool sm_ret = g_pika_server->SetMaster(master_ip_, static_cast(master_port_)); if (sm_ret) { res_.SetRes(CmdRes::kOk); g_pika_server->ClearCacheDbAsync(db_); @@ -176,6 +199,7 @@ void SlaveofCmd::Do() { * dbslaveof db[0 ~ 7] force * dbslaveof db[0 ~ 7] no one * dbslaveof db[0 ~ 7] filenum offset + * Command is deprecated. */ void DbSlaveofCmd::DoInitial() { if (!CheckArg(argv_.size())) { @@ -318,7 +342,11 @@ void BgsaveCmd::DoInitial() { } void BgsaveCmd::Do() { - g_pika_server->DoSameThingSpecificDB(bgsave_dbs_, {TaskType::kBgSave}); + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + g_pika_server->DoSameThingSpecificDB(bgsave_dbs_, {TaskType::kCloudBgSave}); + } else { + g_pika_server->DoSameThingSpecificDB(bgsave_dbs_, {TaskType::kBgSave}); + } LogCommand(); res_.AppendContent("+Background saving started"); } @@ -580,7 +608,9 @@ void FlushallCmd::FlushAllWithoutLock() { return; } DoWithoutLock(db); +#ifndef USE_S3 DoBinlog(g_pika_rm->GetSyncMasterDBs()[p_info]); +#endif } if (res_.ok()) { res_.SetRes(CmdRes::kOk); @@ -667,7 +697,9 @@ void FlushdbCmd::FlushAllDBsWithoutLock() { return; } DoWithoutLock(); +#ifndef USE_S3 DoBinlog(); +#endif } void FlushdbCmd::DoWithoutLock() { @@ -2761,6 +2793,72 @@ void DelbackupCmd::DoInitial() { } void DelbackupCmd::Do() { + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + Aws::SDKOptions options; + Aws::InitAPI(options); + + Aws::Client::ClientConfiguration cfg; + cfg.endpointOverride = g_pika_conf->cloud_endpoint_override(); + cfg.scheme = Aws::Http::Scheme::HTTP; + cfg.verifySSL = false; + + Aws::Auth::AWSCredentials cred(g_pika_conf->cloud_access_key(), + g_pika_conf->cloud_secret_key()); + Aws::S3::S3Client s3_client(cred, cfg, + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + false, Aws::S3::US_EAST_1_REGIONAL_ENDPOINT_OPTION::NOT_SET); + + std::string bucket_name = g_pika_server->storage_options().cloud_fs_options.src_bucket.GetBucketName(); + Aws::S3::Model::DeleteBucketRequest request_del_bucket; + Aws::S3::Model::ListObjectsRequest request_list_object; + + request_del_bucket.SetBucket(bucket_name); + request_list_object.SetBucket(bucket_name); + + bool truncated = false; + //list object and delete file + do { + auto list_objects = s3_client.ListObjects(request_list_object); + if (list_objects.IsSuccess()) { + for (const auto& object : list_objects.GetResult().GetContents()) + { + Aws::S3::Model::DeleteObjectRequest request_del_object; + request_del_object.SetBucket(bucket_name); + request_del_object.SetKey(object.GetKey()); + auto object_del_result = s3_client.DeleteObject(request_del_object); + if (!object_del_result.IsSuccess()) { + res_.SetRes(CmdRes::kErrOther, "DeleteFile error: " + object_del_result.GetError().GetMessage()); + Aws::ShutdownAPI(options); + return; + } + } + + // check if the next page is empty + truncated = list_objects.GetResult().GetIsTruncated(); + if (truncated) { + 
request_list_object.SetMarker(list_objects.GetResult().GetNextMarker()); + } + } else { + res_.SetRes(CmdRes::kErrOther, "ListObjects error: " + list_objects.GetError().GetMessage()); + Aws::ShutdownAPI(options); + return; + } + } while (truncated); + + //del bucket + //todo: At present, this operation is not supported online. + // It will be modified according to deployment in the future + auto bucket_del_result = s3_client.DeleteBucket(request_del_bucket); + if (!bucket_del_result.IsSuccess()) { + res_.SetRes(CmdRes::kErrOther, "DeleteBucket error: " + bucket_del_result.GetError().GetMessage()); + } else { + res_.SetRes(CmdRes::kOk); + } + + Aws::ShutdownAPI(options); + return; + } + std::string db_sync_prefix = g_pika_conf->bgsave_prefix(); std::string db_sync_path = g_pika_conf->bgsave_path(); std::vector dump_dir; @@ -2899,6 +2997,9 @@ void PaddingCmd::DoInitial() { void PaddingCmd::Do() { res_.SetRes(CmdRes::kOk); } std::string PaddingCmd::ToRedisProtocol() { + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + return PikaBinlogTransverter::ConstructPaddingBinlog(BinlogType::TypeFirst, argv_[1].size()); + } return PikaBinlogTransverter::ConstructPaddingBinlog( BinlogType::TypeFirst, argv_[1].size() + BINLOG_ITEM_HEADER_SIZE + PADDING_BINLOG_PROTOCOL_SIZE + SPACE_STROE_PARAMETER_LENGTH); @@ -3213,6 +3314,55 @@ void ClearCacheCmd::Do() { res_.SetRes(CmdRes::kOk, "Cache is cleared"); } +void PKPingCmd::DoInitial() { + if (!CheckArg(argv_.size())) { + res_.SetRes(CmdRes::kWrongNum, kCmdPkPing); + return; + } + + Json::JsonValue json_str(argv_[1]); + Json::JsonView jw(json_str); + + group_id_ = jw.GetInt64("group_id"); + term_id_ = jw.GetInt64("term_id"); + if (jw.ValueExists("masters_addr")) { + auto jsonArrayView = jw.GetArray("masters_addr"); + size_t arraySize = jsonArrayView.GetLength(); + for (size_t i = 0; i < arraySize; ++i) { + if (jsonArrayView[i].IsString()) { + masters_addr_.push_back(jsonArrayView[i].AsString()); + } + } + } + + if (jw.ValueExists("slaves_addr")) { + auto jsonArrayView = jw.GetArray("slaves_addr"); + size_t arraySize = jsonArrayView.GetLength(); + for (size_t i = 0; i < arraySize; ++i) { + if (jsonArrayView[i].IsString()) { + slaves_addr_.push_back(jsonArrayView[i].AsString()); + } + } + } + + if (g_pika_conf->pika_mode() == PIKA_CLOUD + && g_pika_server->role() == PIKA_ROLE_MASTER) { + for (auto const& slave : g_pika_server->slaves_) { + if (std::find(masters_addr_.begin(), masters_addr_.end(), slave.ip_port) != masters_addr_.end()) { + g_pika_server->set_group_id(group_id_); + g_pika_server->set_lease_term_id(term_id_); + } + } + } +} + +void PKPingCmd::Do() { + std::string info; + InfoCmd cmd(kCmdNameSlotsInfo, -1, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSlow); + cmd.InfoReplication(info); + res_.AppendString(info); +} + #ifdef WITH_COMMAND_DOCS bool CommandCmd::CommandFieldCompare::operator()(const std::string& a, const std::string& b) const { diff --git a/src/pika_binlog.cc b/src/pika_binlog.cc index d0a612f24c..7972a2ae6b 100644 --- a/src/pika_binlog.cc +++ b/src/pika_binlog.cc @@ -68,9 +68,10 @@ Binlog::Binlog(std::string binlog_path, const int file_size) // To intergrate with old version, we don't set mmap file size to 100M; // pstd::SetMmapBoundSize(file_size); // pstd::kMmapBoundSize = 1024 * 1024 * 100; + // bin log not init + if (binlog_path_ == "" || file_size_ == 0) return; Status s; - pstd::CreateDir(binlog_path_); filename_ = binlog_path_ + kBinlogPrefix; @@ -164,6 +165,14 @@ Status Binlog::GetProducerStatus(uint32_t* filenum, uint64_t* 
pro_offset, uint32 return Status::OK(); } +Status Binlog::GetOldestBinlogToKeep(uint32_t* filenum, uint32_t* term, uint64_t* logic_id) { + return Status::NotSupported("not supported in local mode"); +} + +Status Binlog::Put(const std::string& item, uint32_t db_id, uint32_t rocksdb_id, uint32_t type) { + return Status::Error("data err"); +} + // Note: mutex lock should be held Status Binlog::Put(const std::string& item) { if (!opened_.load()) { diff --git a/src/pika_binlog_transverter.cc b/src/pika_binlog_transverter.cc index a6f3d2b271..5efd918482 100644 --- a/src/pika_binlog_transverter.cc +++ b/src/pika_binlog_transverter.cc @@ -6,14 +6,13 @@ #include "include/pika_binlog_transverter.h" #include + #include #include #include "pstd/include/pstd_coding.h" - #include "include/pika_command.h" #include "include/pika_define.h" -#include "storage/storage.h" uint32_t BinlogItem::exec_time() const { return exec_time_; } diff --git a/src/pika_cloud_binlog.cc b/src/pika_cloud_binlog.cc new file mode 100644 index 0000000000..03fb4b87f2 --- /dev/null +++ b/src/pika_cloud_binlog.cc @@ -0,0 +1,454 @@ +// Copyright (c) 2015-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#include "include/pika_cloud_binlog.h" + +#include +#include +#include + +#include + +#include "pstd/include/pstd_defer.h" +#include "pstd_status.h" +#include "include/pika_cloud_binlog_transverter.h" + + +using pstd::Status; + +std::string NewCloudFileName(const std::string& name, const uint32_t current) { + char buf[256]; + snprintf(buf, sizeof(buf), "%s%u", name.c_str(), current); + return {buf}; +} + +/* + * CloudVersion + */ +CloudVersion::CloudVersion(const std::shared_ptr& save) : save_(save) { assert(save_ != nullptr); } + +CloudVersion::~CloudVersion() { StableSave(); } + +Status CloudVersion::StableSave() { + char* p = save_->GetData(); + memcpy(p, &pro_num_, sizeof(uint32_t)); + p += 4; + memcpy(p, &pro_offset_, sizeof(uint64_t)); + p += 8; + memcpy(p, &term_, sizeof(uint32_t)); + p += 4; + memcpy(p, &keep_filenum_, sizeof(uint32_t)); + return Status::OK(); +} + +Status CloudVersion::Init() { + Status s; + if (save_->GetData()) { + memcpy(reinterpret_cast(&pro_num_), save_->GetData(), sizeof(uint32_t)); + memcpy(reinterpret_cast(&pro_offset_), save_->GetData() + 4, sizeof(uint64_t)); + memcpy(reinterpret_cast(&term_), save_->GetData() + 12, sizeof(uint32_t)); + memcpy(reinterpret_cast(&keep_filenum_), save_->GetData() + 16, sizeof(uint32_t)); + return Status::OK(); + } else { + return Status::Corruption("version init error"); + } +} + +/* + * Binlog + */ + +CloudBinlog::CloudBinlog(std::string binlog_path, const int file_size) + : Binlog("", 0), + opened_(false), + binlog_path_(std::move(binlog_path)), + file_size_(file_size), + binlog_io_error_(false) { + // To intergrate with old version, we don't set mmap file size to 100M; + // pstd::SetMmapBoundSize(file_size); + // pstd::kMmapBoundSize = 1024 * 1024 * 100; + // bin log not init + if (binlog_path_ == "" || file_size_ == 0) return; + + Status s; + pstd::CreateDir(binlog_path_); + + filename_ = binlog_path_ + kBinlogPrefix; + const std::string manifest = binlog_path_ + kManifest; + std::string profile; + + if (!pstd::FileExists(manifest)) { + LOG(INFO) << "Cloud Binlog: Manifest file not exist, we create a new one."; + + profile = 
NewFileName(filename_, pro_num_); + s = pstd::NewWritableFile(profile, queue_); + if (!s.ok()) { + LOG(FATAL) << "Cloud Binlog: new " << filename_ << " " << s.ToString(); + } + std::unique_ptr tmp_file; + s = pstd::NewRWFile(manifest, tmp_file); + versionfile_.reset(tmp_file.release()); + if (!s.ok()) { + LOG(FATAL) << "Cloud Binlog: new versionfile error " << s.ToString(); + } + + version_ = std::make_unique(versionfile_); + version_->StableSave(); + } else { + LOG(INFO) << "Cloud Binlog: Find the exist file."; + std::unique_ptr tmp_file; + s = pstd::NewRWFile(manifest, tmp_file); + versionfile_.reset(tmp_file.release()); + if (s.ok()) { + version_ = std::make_unique(versionfile_); + version_->Init(); + pro_num_ = version_->pro_num_; + + // Debug + // version_->debug(); + } else { + LOG(FATAL) << "Cloud Binlog: open versionfile error"; + } + + profile = NewFileName(filename_, pro_num_); + DLOG(INFO) << "Cloud Binlog: open profile " << profile; + s = pstd::AppendWritableFile(profile, queue_, version_->pro_offset_); + if (!s.ok()) { + LOG(FATAL) << "Cloud Binlog: Open file " << profile << " error " << s.ToString(); + } + + uint64_t filesize = queue_->Filesize(); + DLOG(INFO) << "Cloud Binlog: filesize is " << filesize; + } + + InitLogFile(); +} + +CloudBinlog::~CloudBinlog() { + std::lock_guard l(mutex_); + Close(); +} + +void CloudBinlog::Close() { + if (!opened_.load()) { + return; + } + opened_.store(false); +} + +void CloudBinlog::InitLogFile() { + assert(queue_ != nullptr); + uint64_t filesize = queue_->Filesize(); + block_offset_ = static_cast(filesize % kBlockSize); + opened_.store(true); +} + +Status CloudBinlog::GetProducerStatus(uint32_t* filenum, uint64_t* pro_offset, uint32_t* term, uint64_t* logic_id) { + if (!opened_.load()) { + return Status::Busy("Cloud Binlog is not open yet"); + } + + std::shared_lock l(version_->rwlock_); + + *filenum = version_->pro_num_; + *pro_offset = version_->pro_offset_; + if (term) { + *term = version_->term_; + } + + return Status::OK(); +} + +Status CloudBinlog::GetOldestBinlogToKeep(uint32_t* filenum, uint32_t* term, uint64_t* logic_id) { + if (!opened_.load()) { + return Status::Busy("Cloud Binlog is not open yet"); + } + + std::shared_lock l(version_->rwlock_); + *filenum = version_->keep_filenum_; + if (term) { + *term = version_->term_; + } + return Status::OK(); +} + +Status CloudBinlog::Put(const std::string& item) { + if (!opened_.load()) { + return Status::Busy("Cloud Binlog is not open yet"); + } + + Lock(); + DEFER { Unlock(); }; + + Status s = Put(item.c_str(), static_cast(item.size())); + if (!s.ok()) { + binlog_io_error_.store(true); + } + return s; +} +// Note: mutex lock should be held +Status CloudBinlog::Put(const std::string& item, uint32_t db_id, uint32_t rocksdb_id, uint32_t type) { + if (!opened_.load()) { + return Status::Busy("Cloud Binlog is not open yet"); + } + uint32_t filenum = 0; + uint32_t term = 0; + uint64_t offset = 0; + + Lock(); + DEFER { Unlock(); }; + + Status s = GetProducerStatus(&filenum, &offset, &term, nullptr); + if (!s.ok()) { + return s; + } + std::string data = PikaCloudBinlogTransverter::BinlogEncode(db_id, rocksdb_id, time(nullptr), term, filenum, offset, item, type); + + s = Put(data.c_str(), static_cast(data.size())); + if (!s.ok()) { + binlog_io_error_.store(true); + } + // record first binlog item and manifest update binlog item + if (type != 0 || binlog_to_keep_.find(rocksdb_id) == binlog_to_keep_.end()) { + binlog_to_keep_[rocksdb_id] = filenum; + } + + uint32_t keep_filenum = 
binlog_to_keep_.begin()->second; + for (const auto& offset : binlog_to_keep_) { + keep_filenum = std::min(keep_filenum, offset.second); + } + + version_->keep_filenum_ = keep_filenum; + return s; +} + +// Note: mutex lock should be held +Status CloudBinlog::Put(const char* item, int len) { + Status s; + /* Check to roll log file */ + uint64_t filesize = queue_->Filesize(); + if (filesize > file_size_) { + std::unique_ptr queue; + std::string profile = NewCloudFileName(filename_, pro_num_ + 1); + s = pstd::NewWritableFile(profile, queue); + if (!s.ok()) { + LOG(ERROR) << "Cloud Binlog: new " << filename_ << " " << s.ToString(); + return s; + } + queue_.reset(); + queue_ = std::move(queue); + pro_num_++; + + { + std::lock_guard l(version_->rwlock_); + version_->pro_offset_ = 0; + version_->pro_num_ = pro_num_; + version_->StableSave(); + } + InitLogFile(); + } + + int pro_offset = 0; + s = Produce(pstd::Slice(item, len), &pro_offset); + if (s.ok()) { + std::lock_guard l(version_->rwlock_); + version_->pro_offset_ = pro_offset; + version_->StableSave(); + } + + return s; +} + +Status CloudBinlog::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, int* temp_pro_offset) { + Status s; + assert(n <= 0xffffff); + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + char buf[kHeaderSize]; + + uint64_t now = 0; + struct timeval tv; + gettimeofday(&tv, nullptr); + now = tv.tv_sec; + buf[0] = static_cast(n & 0xff); + buf[1] = static_cast((n & 0xff00) >> 8); + buf[2] = static_cast(n >> 16); + buf[3] = static_cast(now & 0xff); + buf[4] = static_cast((now & 0xff00) >> 8); + buf[5] = static_cast((now & 0xff0000) >> 16); + buf[6] = static_cast((now & 0xff000000) >> 24); + buf[7] = static_cast(t); + + s = queue_->Append(pstd::Slice(buf, kHeaderSize)); + if (s.ok()) { + s = queue_->Append(pstd::Slice(ptr, n)); + if (s.ok()) { + s = queue_->Flush(); + } + } + block_offset_ += static_cast(kHeaderSize + n); + + *temp_pro_offset += static_cast(kHeaderSize + n); + return s; +} + +Status CloudBinlog::Produce(const pstd::Slice& item, int* temp_pro_offset) { + Status s; + const char* ptr = item.data(); + size_t left = item.size(); + bool begin = true; + + *temp_pro_offset = static_cast(version_->pro_offset_); + do { + const int leftover = static_cast(kBlockSize) - block_offset_; + assert(leftover >= 0); + if (static_cast(leftover) < kHeaderSize) { + if (leftover > 0) { + s = queue_->Append(pstd::Slice("\x00\x00\x00\x00\x00\x00\x00", leftover)); + if (!s.ok()) { + return s; + } + *temp_pro_offset += leftover; + } + block_offset_ = 0; + } + + const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t fragment_length = (left < avail) ? left : avail; + RecordType type; + const bool end = (left == fragment_length); + if (begin && end) { + type = kFullType; + } else if (begin) { + type = kFirstType; + } else if (end) { + type = kLastType; + } else { + type = kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length, temp_pro_offset); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && left > 0); + + return s; +} + +Status CloudBinlog::AppendPadding(pstd::WritableFile* file, uint64_t* len) { + if (*len < kHeaderSize) { + return Status::OK(); + } + + Status s; + char buf[kBlockSize]; + uint64_t now = 0; + struct timeval tv; + gettimeofday(&tv, nullptr); + now = tv.tv_sec; + + uint64_t left = *len; + while (left > 0 && s.ok()) { + uint32_t size = (left >= kBlockSize) ? 
kBlockSize : left; + if (size < kHeaderSize) { + break; + } else { + uint32_t bsize = size - kHeaderSize; + std::string binlog(bsize, '*'); + buf[0] = static_cast(bsize & 0xff); + buf[1] = static_cast((bsize & 0xff00) >> 8); + buf[2] = static_cast(bsize >> 16); + buf[3] = static_cast(now & 0xff); + buf[4] = static_cast((now & 0xff00) >> 8); + buf[5] = static_cast((now & 0xff0000) >> 16); + buf[6] = static_cast((now & 0xff000000) >> 24); + // kBadRecord here + buf[7] = static_cast(kBadRecord); + s = file->Append(pstd::Slice(buf, kHeaderSize)); + if (s.ok()) { + s = file->Append(pstd::Slice(binlog.data(), binlog.size())); + if (s.ok()) { + s = file->Flush(); + left -= size; + } + } + } + } + *len -= left; + if (left != 0) { + LOG(WARNING) << "Cloud AppendPadding left bytes: " << left << " is less then kHeaderSize"; + } + return s; +} + +Status CloudBinlog::SetProducerStatus(uint32_t pro_num, uint64_t pro_offset, uint32_t term, uint64_t index) { + if (!opened_.load()) { + return Status::Busy("Cloud Binlog is not open yet"); + } + + std::lock_guard l(mutex_); + + // offset smaller than the first header + if (pro_offset < 4) { + pro_offset = 0; + } + + queue_.reset(); + + std::string init_profile = NewCloudFileName(filename_, 0); + if (pstd::FileExists(init_profile)) { + pstd::DeleteFile(init_profile); + } + + std::string profile = NewCloudFileName(filename_, pro_num); + if (pstd::FileExists(profile)) { + pstd::DeleteFile(profile); + } + + pstd::NewWritableFile(profile, queue_); + CloudBinlog::AppendPadding(queue_.get(), &pro_offset); + + pro_num_ = pro_num; + + { + std::lock_guard l(version_->rwlock_); + version_->pro_num_ = pro_num; + version_->pro_offset_ = pro_offset; + version_->term_ = term; + version_->StableSave(); + } + + InitLogFile(); + return Status::OK(); +} + +Status CloudBinlog::Truncate(uint32_t pro_num, uint64_t pro_offset, uint64_t index) { + queue_.reset(); + std::string profile = NewCloudFileName(filename_, pro_num); + const int fd = open(profile.c_str(), O_RDWR | O_CLOEXEC, 0644); + if (fd < 0) { + return Status::IOError("fd open failed"); + } + if (ftruncate(fd, static_cast(pro_offset)) != 0) { + return Status::IOError("ftruncate failed"); + } + close(fd); + + pro_num_ = pro_num; + { + std::lock_guard l(version_->rwlock_); + version_->pro_num_ = pro_num; + version_->pro_offset_ = pro_offset; + version_->StableSave(); + } + + Status s = pstd::AppendWritableFile(profile, queue_, version_->pro_offset_); + if (!s.ok()) { + return s; + } + + InitLogFile(); + + return Status::OK(); +} diff --git a/src/pika_cloud_binlog.proto b/src/pika_cloud_binlog.proto new file mode 100644 index 0000000000..c54df8f5b9 --- /dev/null +++ b/src/pika_cloud_binlog.proto @@ -0,0 +1,17 @@ +syntax = "proto3"; + +package cloud; + +message BinlogCloudItem { + //belong to which db + uint64 db_id = 1; + //belong to whicn rocksdb + uint64 rocksdb_id = 2; + //data write time + uint64 exec_time = 3; + uint64 term_id = 4; + uint64 file_num = 5; + uint64 offset = 6; + bytes content = 7; + uint64 type = 8; +} diff --git a/src/pika_cloud_binlog_transverter.cc b/src/pika_cloud_binlog_transverter.cc new file mode 100644 index 0000000000..498106fcd9 --- /dev/null +++ b/src/pika_cloud_binlog_transverter.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "include/pika_cloud_binlog_transverter.h" + +#include + +#include + +#include "include/pika_command.h" +#include "pstd/include/pstd_coding.h" + +const int PADDING_BINLOG_PROTOCOL_SIZE = 22; +const int SPACE_STROE_PARAMETER_LENGTH = 5; + +std::string PikaCloudBinlogTransverter::BinlogEncode(uint32_t db_id, uint32_t rocksdb_id, uint32_t exec_time, + uint32_t term_id, uint32_t filenum, uint64_t offset, + const std::string& content, uint32_t type) { + std::string serialize_binlog; + cloud::BinlogCloudItem binlog_item; + binlog_item.set_db_id(db_id); + binlog_item.set_rocksdb_id(rocksdb_id); + binlog_item.set_exec_time(exec_time); + binlog_item.set_term_id(term_id); + binlog_item.set_file_num(filenum); + binlog_item.set_offset(offset); + binlog_item.set_content(content); + binlog_item.set_type(type); + binlog_item.SerializeToString(&serialize_binlog); + return serialize_binlog; +} + +bool PikaCloudBinlogTransverter::BinlogDecode(const std::string& binlog, cloud::BinlogCloudItem* binlog_item) { + auto res = binlog_item->ParseFromString(binlog); + if (!res) { + LOG(ERROR) << "Failed to deserialize cloud binlog item"; + return false; + } + return true; +} + +std::string PikaCloudBinlogTransverter::ConstructPaddingBinlog(uint32_t parameter_len) { + std::string binlog; + cloud::BinlogCloudItem binlog_item; + if (parameter_len < 0) { + return {}; + } + + std::string content; + RedisAppendLen(content, 2, "*"); + RedisAppendLen(content, 7, "$"); + RedisAppendContent(content, "padding"); + + std::string parameter_len_str; + std::ostringstream os; + os << parameter_len; + std::istringstream is(os.str()); + is >> parameter_len_str; + if (parameter_len_str.size() > SPACE_STROE_PARAMETER_LENGTH) { + return {}; + } + + content.append("$"); + content.append(SPACE_STROE_PARAMETER_LENGTH - parameter_len_str.size(), '0'); + content.append(parameter_len_str); + content.append(kNewLine); + RedisAppendContent(content, std::string(parameter_len, '*')); + + BinlogEncode(0, 0, 0, 0, 0, 0, content, 0); + return binlog; +} + +bool PikaCloudBinlogTransverter::BinlogItemWithoutContentDecode(const std::string& binlog, + cloud::BinlogCloudItem* binlog_item) { + auto res = binlog_item->ParseFromString(binlog); + if (!res) { + LOG(ERROR) << "Failed to deserialize cloud binlog item"; + return false; + } + return true; +} diff --git a/src/pika_command.cc b/src/pika_command.cc index b76baca28b..6a6265f211 100644 --- a/src/pika_command.cc +++ b/src/pika_command.cc @@ -54,10 +54,10 @@ void InitCmdTable(CmdTable* cmd_table) { cmd_table->insert(std::pair>(kCmdNameBgsave, std::move(bgsaveptr))); std::unique_ptr compactptr = - std::make_unique(kCmdNameCompact, -1, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSlow); + std::make_unique(kCmdNameCompact, -1, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSlow | kCmdFlagsSuspend); cmd_table->insert(std::pair>(kCmdNameCompact, std::move(compactptr))); - std::unique_ptr compactrangeptr = std::make_unique(kCmdNameCompactRange, 5, kCmdFlagsRead | kCmdFlagsAdmin); + std::unique_ptr compactrangeptr = std::make_unique(kCmdNameCompactRange, 5, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSuspend); cmd_table->insert(std::pair>(kCmdNameCompactRange, std::move(compactrangeptr))); std::unique_ptr purgelogsto = std::make_unique(kCmdNamePurgelogsto, -2, kCmdFlagsRead | kCmdFlagsAdmin); @@ -820,6 +820,10 @@ void InitCmdTable(CmdTable* cmd_table) { std::unique_ptr xinfoptr = std::make_unique(kCmdNameXInfo, -2, kCmdFlagsRead | kCmdFlagsStream | kCmdFlagsSlow); cmd_table->insert(std::pair>(kCmdNameXInfo, 
std::move(xinfoptr))); + ////PKPING + std::unique_ptr pkpingptr = + std::make_unique(kCmdPkPing, 2, kCmdFlagsRead | kCmdFlagsAdmin | kCmdFlagsSlow); + cmd_table->insert(std::pair>(kCmdPkPing, std::move(pkpingptr))); } Cmd* GetCmdFromDB(const std::string& opt, const CmdTable& cmd_table) { @@ -880,7 +884,9 @@ void Cmd::InternalProcessCommand(const HintKeys& hint_keys) { do_duration_ += pstd::NowMicros() - start_us; } - DoBinlog(); + if (g_pika_conf->pika_mode() == PIKA_LOCAL) { + DoBinlog(); + } if (is_write()) { record_lock.Unlock(current_key()); diff --git a/src/pika_conf.cc b/src/pika_conf.cc index 8086bb2285..c732f4bf15 100644 --- a/src/pika_conf.cc +++ b/src/pika_conf.cc @@ -486,6 +486,8 @@ int PikaConf::Load() { int cache_num = 16; GetConfInt("cache-num", &cache_num); cache_num_ = (0 >= cache_num || 48 < cache_num) ? 16 : cache_num; + //todo: pika init pika_mode from conf + pika_mode_ = PIKA_CLOUD; int cache_model = 0; GetConfInt("cache-model", &cache_model); @@ -550,6 +552,12 @@ int PikaConf::Load() { if (min_blob_size_ <= 0) { min_blob_size_ = 4096; } +#ifdef USE_S3 + GetConfInt64("sst-cache-size", &sst_cache_size_); + if (sst_cache_size_ <= 0) { + sst_cache_size_ = 10LL << 30; + } +#endif GetConfInt64Human("blob-file-size", &blob_file_size_); if (blob_file_size_ <= 0) { blob_file_size_ = 256 * 1024 * 1024; @@ -578,6 +586,17 @@ int PikaConf::Load() { max_rsync_parallel_num_ = 4; } + // rocksdb-cloud options + GetConfStr("cloud-endpoint-override", &cloud_endpoint_override_); + GetConfStr("cloud-access-key", &cloud_access_key_); + GetConfStr("cloud-secret-key", &cloud_secret_key_); + GetConfStr("cloud-src-bucket-prefix", &cloud_src_bucket_prefix_); + GetConfStr("cloud-src-bucket-suffix", &cloud_src_bucket_suffix_); + GetConfStr("cloud-src-bucket-region", &cloud_src_bucket_region_); + GetConfStr("cloud-dest-bucket-prefix", &cloud_dest_bucket_prefix_); + GetConfStr("cloud-dest-bucket-suffix", &cloud_dest_bucket_suffix_); + GetConfStr("cloud-dest-bucket-region", &cloud_dest_bucket_region_); + return ret; } diff --git a/src/pika_consensus.cc b/src/pika_consensus.cc index 3d08a4a642..83247ed1f9 100644 --- a/src/pika_consensus.cc +++ b/src/pika_consensus.cc @@ -12,6 +12,7 @@ #include "include/pika_conf.h" #include "include/pika_rm.h" #include "include/pika_server.h" +#include "pika_codis_slot.h" using pstd::Status; @@ -31,8 +32,10 @@ Status Context::StableSave() { memcpy(p, &(applied_index_.b_offset.offset), sizeof(uint64_t)); p += 8; memcpy(p, &(applied_index_.l_offset.term), sizeof(uint32_t)); - p += 4; - memcpy(p, &(applied_index_.l_offset.index), sizeof(uint64_t)); + if (g_pika_conf->pika_mode() == PIKA_LOCAL) { + p += 4; + memcpy(p, &(applied_index_.l_offset.index), sizeof(uint64_t)); + } return Status::OK(); } @@ -55,13 +58,15 @@ Status Context::Init() { memcpy(reinterpret_cast(&(applied_index_.b_offset.filenum)), save_->GetData(), sizeof(uint32_t)); memcpy(reinterpret_cast(&(applied_index_.b_offset.offset)), save_->GetData() + 4, sizeof(uint64_t)); memcpy(reinterpret_cast(&(applied_index_.l_offset.term)), save_->GetData() + 12, sizeof(uint32_t)); - memcpy(reinterpret_cast(&(applied_index_.l_offset.index)), save_->GetData() + 16, sizeof(uint64_t)); + if (g_pika_conf->pika_mode() == PIKA_LOCAL) { + memcpy(reinterpret_cast(&(applied_index_.l_offset.index)), save_->GetData() + 16, sizeof(uint64_t)); + } return Status::OK(); } else { return Status::Corruption("Context init error"); } } - +//not used func void Context::UpdateAppliedIndex(const LogOffset& offset) { std::lock_guard 
l(rwlock_); LogOffset cur_offset; diff --git a/src/pika_db.cc b/src/pika_db.cc index ce51132499..eccfd107ea 100644 --- a/src/pika_db.cc +++ b/src/pika_db.cc @@ -7,6 +7,7 @@ #include #include "include/pika_db.h" +#include "storage/storage_define.h" #include "include/pika_cmd_table_manager.h" #include "include/pika_rm.h" @@ -39,7 +40,13 @@ DB::DB(std::string db_name, const std::string& db_path, log_path_ = DBPath(log_path, "log_" + db_name_); storage_ = std::make_shared(g_pika_conf->db_instance_num(), g_pika_conf->default_slot_num(), g_pika_conf->classic_mode()); - rocksdb::Status s = storage_->Open(g_pika_server->storage_options(), db_path_); + rocksdb::Status s; +#ifdef USE_S3 + std::shared_ptr wal_writer = g_pika_rm->GetSyncMasterDBByName(db_name)->StableLogger()->Logger(); + s = storage_->Open(g_pika_server->storage_options(), db_path_, wal_writer); +#else + s = storage_->Open(g_pika_server->storage_options(), db_path_); +#endif pstd::CreatePath(db_path_); pstd::CreatePath(log_path_); lock_mgr_ = std::make_shared(1000, 0, std::make_shared()); @@ -68,6 +75,19 @@ void DB::BgSaveDB() { g_pika_server->BGSaveTaskSchedule(&DoBgSave, static_cast(bg_task_arg)); } +void DB::CloudBgSaveDB() { + std::shared_lock l(dbs_rw_); + std::lock_guard ml(bgsave_protector_); + if (bgsave_info_.bgsaving) { + return; + } + bgsave_info_.bgsaving = true; + auto bg_task_arg = new BgTaskArg(); + bg_task_arg->db = shared_from_this(); + bg_task_arg->cloud_fs_options = g_pika_server->storage_options().cloud_fs_options; + g_pika_server->BGSaveTaskSchedule(&DoCloudBgSave, static_cast(bg_task_arg)); +} + void DB::SetBinlogIoError() { return binlog_io_error_.store(true); } void DB::SetBinlogIoErrorrelieve() { return binlog_io_error_.store(false); } bool DB::IsBinlogIoError() { return binlog_io_error_.load(); } @@ -205,6 +225,12 @@ DisplayCacheInfo DB::GetCacheInfo() { } bool DB::FlushDBWithoutLock() { +#ifdef USE_S3 + LOG(INFO) << db_name_ << " flushing db..."; + auto st = storage_->FlushDB(); + LOG(INFO) << db_name_ << " flushing db done, status: " << st.ToString(); + return st.ok(); +#endif if (bgsave_info_.bgsaving) { return false; } @@ -285,6 +311,12 @@ void DB::DoBgSave(void* arg) { bg_task_arg->db->FinishBgsave(); } +void DB::DoCloudBgSave(void* arg) { + std::unique_ptr bg_task_arg(static_cast(arg)); + bg_task_arg->db->RunCloudBgsaveEngine(bg_task_arg->cloud_fs_options); + bg_task_arg->db->FinishCloudBgsave(); +} + bool DB::RunBgsaveEngine() { // Prepare for Bgsaving if (!InitBgsaveEnv() || !InitBgsaveEngine()) { @@ -297,6 +329,7 @@ bool DB::RunBgsaveEngine() { LOG(INFO) << db_name_ << " bgsave_info: path=" << info.path << ", filenum=" << info.offset.b_offset.filenum << ", offset=" << info.offset.b_offset.offset; +#ifndef USE_S3 // Backup to tmp dir rocksdb::Status s = bgsave_engine_->CreateNewBackup(info.path); @@ -304,11 +337,21 @@ bool DB::RunBgsaveEngine() { LOG(WARNING) << db_name_ << " create new backup failed :" << s.ToString(); return false; } +#endif LOG(INFO) << db_name_ << " create new backup finished."; return true; } +void DB::RunCloudBgsaveEngine(rocksdb::CloudFileSystemOptions& cloud_fs_options) { + rocksdb::Status s = bgsave_engine_->CreateNewCloudBackup(cloud_fs_options, g_pika_conf.get()); + if (!s.ok()) { + LOG(WARNING) << db_name_ << " create new cloud backup failed :" << s.ToString(); + return; + } + LOG(INFO) << db_name_ << " create new cloud backup finished."; +} + BgSaveInfo DB::bgsave_info() { std::lock_guard l(bgsave_protector_); return bgsave_info_; @@ -320,6 +363,11 @@ void 
DB::FinishBgsave() { g_pika_server->UpdateLastSave(time(nullptr)); } +void DB::FinishCloudBgsave() { + std::lock_guard l(bgsave_protector_); + bgsave_info_.bgsaving = false; +} + // Prepare engine, need bgsave_protector protect bool DB::InitBgsaveEnv() { std::lock_guard l(bgsave_protector_); @@ -363,16 +411,24 @@ bool DB::InitBgsaveEngine() { std::lock_guard lock(db_rwlock_); LogOffset bgsave_offset; // term, index are 0 +#ifdef USE_S3 + db->Logger()->GetOldestBinlogToKeep(&(bgsave_offset.b_offset.filenum)); + PikaBinlogReader::GetFirstOffset(db->Logger(), bgsave_offset.b_offset.filenum, &bgsave_offset.b_offset.offset); + LOG(WARNING) << "bgsave info binlog filenum: " << bgsave_offset.b_offset.filenum << " offset: " << bgsave_offset.b_offset.offset; +#else db->Logger()->GetProducerStatus(&(bgsave_offset.b_offset.filenum), &(bgsave_offset.b_offset.offset)); +#endif { std::lock_guard l(bgsave_protector_); bgsave_info_.offset = bgsave_offset; } +#ifndef USE_S3 s = bgsave_engine_->SetBackupContent(); if (!s.ok()) { LOG(WARNING) << db_name_ << " set backup content failed " << s.ToString(); return false; } +#endif } return true; } @@ -504,11 +560,13 @@ bool DB::TryUpdateMasterOffset() { << ", offset: " << offset << ", term: " << term << ", index: " << index; pstd::DeleteFile(info_path); +#ifndef USE_S3 if (!ChangeDb(dbsync_path_)) { LOG(WARNING) << "DB: " << db_name_ << ", Failed to change db"; slave_db->SetReplState(ReplState::kError); return false; } +#endif // Update master offset std::shared_ptr master_db = @@ -638,3 +696,28 @@ bool DB::FlushDB() { std::lock_guard l(bgsave_protector_); return FlushDBWithoutLock(); } + +rocksdb::Status DB::SwitchMaster(bool is_old_master, bool is_new_master) { + return storage_->SwitchMaster(is_old_master, is_new_master); +} + +rocksdb::Status DB::ApplyWAL(int rocksdb_id, + int type, const std::string& content) { + if (type == static_cast(storage::RocksDBRecordType::kMemtableWrite) && + storage_->ShouldSkip(rocksdb_id, content)) { + return rocksdb::Status::OK(); + } + if (type == static_cast(storage::RocksDBRecordType::kFlushDB)) { + auto s = storage_->FlushDBAtSlave(rocksdb_id); + return s; + } + std::unordered_set redis_keys; + auto s = storage_->ApplyWAL(rocksdb_id, type, content, &redis_keys); + if (!s.ok()) { + return s; + } + for (const auto& key : redis_keys) { + cache_->Del({key}); + } + return s; +} diff --git a/src/pika_inner_message.proto b/src/pika_inner_message.proto index 9e2a3ef04c..db7a87e463 100644 --- a/src/pika_inner_message.proto +++ b/src/pika_inner_message.proto @@ -35,9 +35,14 @@ message Slot { } message DBInfo { - required string db_name = 1; - required uint32 slot_num = 2; - repeated uint32 slot_ids = 3; + required string db_name = 1; + required uint32 slot_num = 2; + repeated uint32 slot_ids = 3; + //s3 info + optional string cloud_endpoint_override = 4; + optional string cloud_bucket_prefix = 5; + optional string cloud_bucket_suffix = 6; + optional string cloud_bucket_region = 7; } message PikaMeta { @@ -114,6 +119,11 @@ message InnerResponse { required string db_name = 1; required int32 slot_num = 2; required int32 db_instance_num = 3; + //s3 info + optional string cloud_endpoint_override = 4; + optional string cloud_bucket_prefix = 5; + optional string cloud_bucket_suffix = 6; + optional string cloud_bucket_region = 7; } required bool classic_mode = 1; repeated DBInfo dbs_info = 2; diff --git a/src/pika_repl_bgworker.cc b/src/pika_repl_bgworker.cc index 4f372351f2..925efb9a25 100644 --- a/src/pika_repl_bgworker.cc +++ 
b/src/pika_repl_bgworker.cc @@ -5,13 +5,16 @@ #include -#include "include/pika_repl_bgworker.h" +#include "include/pika_cloud_binlog_transverter.h" #include "include/pika_cmd_table_manager.h" +#include "include/pika_conf.h" +#include "include/pika_repl_bgworker.h" #include "include/pika_rm.h" #include "include/pika_server.h" #include "pstd/include/pstd_defer.h" #include "src/pstd/include/scope_record_lock.h" -#include "include/pika_conf.h" +#include "pika_cloud_binlog.pb.h" +#include "storage/storage_define.h" extern PikaServer* g_pika_server; extern std::unique_ptr g_pika_rm; @@ -132,20 +135,46 @@ void PikaReplBgWorker::HandleBGWorkerWriteBinlog(void* arg) { if (binlog_res.binlog().empty()) { continue; } - if (!PikaBinlogTransverter::BinlogItemWithoutContentDecode(TypeFirst, binlog_res.binlog(), &worker->binlog_item_)) { - LOG(WARNING) << "Binlog item decode failed"; - slave_db->SetReplState(ReplState::kTryConnect); - return; - } - const char* redis_parser_start = binlog_res.binlog().data() + BINLOG_ENCODE_LEN; - int redis_parser_len = static_cast(binlog_res.binlog().size()) - BINLOG_ENCODE_LEN; - int processed_len = 0; - net::RedisParserStatus ret = - worker->redis_parser_.ProcessInputBuffer(redis_parser_start, redis_parser_len, &processed_len); - if (ret != net::kRedisParserDone) { - LOG(WARNING) << "Redis parser failed"; - slave_db->SetReplState(ReplState::kTryConnect); - return; + + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + cloud::BinlogCloudItem binlog_item; + if (!PikaCloudBinlogTransverter::BinlogItemWithoutContentDecode(binlog_res.binlog(), &binlog_item)) { + LOG(WARNING) << "Cloud Binlog item decode failed"; + slave_db->SetReplState(ReplState::kTryConnect); + return; + } + + std::shared_ptr db = + g_pika_rm->GetSyncMasterDBByName(DBInfo(worker->db_name_)); + if (!db) { + LOG(WARNING) << worker->db_name_ <<" not found"; + slave_db->SetReplState(ReplState::kTryConnect); + return; + } + db->Logger()->Put(binlog_res.binlog()); + auto s = g_pika_server->GetDB(worker->db_name_)->ApplyWAL(binlog_item.rocksdb_id(), binlog_item.type(), binlog_item.content()); + if (!s.ok()) { + LOG(WARNING) << "applywal at slave node failed, error: " << s.ToString(); + slave_db->SetReplState(ReplState::kTryConnect); + return; + } + } else { + if (!PikaBinlogTransverter::BinlogItemWithoutContentDecode(TypeFirst, binlog_res.binlog(), &worker->binlog_item_)) { + LOG(WARNING) << "Binlog item decode failed"; + slave_db->SetReplState(ReplState::kTryConnect); + return; + } + + const char* redis_parser_start = binlog_res.binlog().data() + BINLOG_ENCODE_LEN; + int redis_parser_len = static_cast(binlog_res.binlog().size()) - BINLOG_ENCODE_LEN; + int processed_len = 0; + net::RedisParserStatus ret = + worker->redis_parser_.ProcessInputBuffer(redis_parser_start, redis_parser_len, &processed_len); + if (ret != net::kRedisParserDone) { + LOG(WARNING) << "Redis parser failed"; + slave_db->SetReplState(ReplState::kTryConnect); + return; + } } } @@ -208,7 +237,6 @@ void PikaReplBgWorker::HandleBGWorkerWriteDB(void* arg) { std::unique_ptr task_arg(static_cast(arg)); const std::shared_ptr c_ptr = task_arg->cmd_ptr; const PikaCmdArgsType& argv = c_ptr->argv(); - LogOffset offset = task_arg->offset; std::string db_name = task_arg->db_name; uint64_t start_us = 0; @@ -221,6 +249,7 @@ void PikaReplBgWorker::HandleBGWorkerWriteDB(void* arg) { if (!c_ptr->IsSuspend()) { c_ptr->GetDB()->DbRWLockReader(); } + if (c_ptr->IsNeedCacheDo() && PIKA_CACHE_NONE != g_pika_conf->cache_model() && c_ptr->GetDB()->cache()->CacheStatus() 
== PIKA_CACHE_STATUS_OK) { diff --git a/src/pika_repl_client_conn.cc b/src/pika_repl_client_conn.cc index 672648d64d..8db9383c47 100644 --- a/src/pika_repl_client_conn.cc +++ b/src/pika_repl_client_conn.cc @@ -110,7 +110,13 @@ void PikaReplClientConn::HandleMetaSyncResponse(void* arg) { std::vector master_db_structs; for (int idx = 0; idx < meta_sync.dbs_info_size(); ++idx) { const InnerMessage::InnerResponse_MetaSync_DBInfo& db_info = meta_sync.dbs_info(idx); - master_db_structs.push_back({db_info.db_name(), db_info.db_instance_num()}); + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + master_db_structs.push_back({db_info.db_name(), db_info.db_instance_num(), + db_info.cloud_endpoint_override(), db_info.cloud_bucket_prefix(), + db_info.cloud_bucket_suffix(), db_info.cloud_bucket_region()}); + } else { + master_db_structs.push_back({db_info.db_name(), db_info.db_instance_num()}); + } } std::vector self_db_structs = g_pika_conf->db_structs(); diff --git a/src/pika_repl_server_conn.cc b/src/pika_repl_server_conn.cc index 21847db3cd..ed337e9109 100644 --- a/src/pika_repl_server_conn.cc +++ b/src/pika_repl_server_conn.cc @@ -65,6 +65,12 @@ void PikaReplServerConn::HandleMetaSyncRequest(void* arg) { */ db_info->set_slot_num(1); db_info->set_db_instance_num(db_struct.db_instance_num); + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + db_info->set_cloud_endpoint_override(db_struct.cloud_endpoint_override); + db_info->set_cloud_bucket_prefix(db_struct.cloud_bucket_prefix); + db_info->set_cloud_bucket_suffix(db_struct.cloud_bucket_suffix); + db_info->set_cloud_bucket_region(db_struct.cloud_bucket_region); + } } } } @@ -116,9 +122,19 @@ void PikaReplServerConn::HandleTrySyncRequest(void* arg) { << ", pro_offset: " << slave_boffset.offset(); response.set_code(InnerMessage::kOk); } - - if (pre_success && TrySyncOffsetCheck(db, try_sync_request, try_sync_response)) { - TrySyncUpdateSlaveNode(db, try_sync_request, conn, try_sync_response); + //In cloud mode, only full synchronization is possible. 
+ if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + if (pre_success) { + if (!db->CheckSlaveNodeExist(node.ip(), node.port())) { + try_sync_response->set_reply_code(InnerMessage::InnerResponse::TrySync::kSyncPointBePurged); + } else if (TrySyncOffsetCheck(db, try_sync_request, try_sync_response)){ + TrySyncUpdateSlaveNode(db, try_sync_request, conn, try_sync_response); + } + } + } else { + if (pre_success && TrySyncOffsetCheck(db, try_sync_request, try_sync_response)) { + TrySyncUpdateSlaveNode(db, try_sync_request, conn, try_sync_response); + } } std::string reply_str; diff --git a/src/pika_rm.cc b/src/pika_rm.cc index e1deb6d8c8..8ebabe32ee 100644 --- a/src/pika_rm.cc +++ b/src/pika_rm.cc @@ -18,6 +18,7 @@ #include "include/pika_server.h" #include "include/pika_admin.h" +#include "include/pika_cloud_binlog_transverter.h" #include "include/pika_command.h" using pstd::Status; @@ -164,12 +165,25 @@ Status SyncMasterDB::ReadBinlogFileToWq(const std::shared_ptr& slave_ return s; } BinlogItem item; - if (!PikaBinlogTransverter::BinlogItemWithoutContentDecode(TypeFirst, msg, &item)) { - LOG(WARNING) << "Binlog item decode failed"; - return Status::Corruption("Binlog item decode failed"); + cloud::BinlogCloudItem cloud_item; + if (g_pika_conf->pika_mode() == PIKA_CLOUD){ + if (!PikaCloudBinlogTransverter::BinlogItemWithoutContentDecode(msg, &cloud_item)) { + return Status::Corruption("Binlog item decode failed"); + } + } else { + if (!PikaBinlogTransverter::BinlogItemWithoutContentDecode(TypeFirst, msg, &item)) { + LOG(WARNING) << "Binlog item decode failed"; + return Status::Corruption("Binlog item decode failed"); + } } + BinlogOffset sent_b_offset = BinlogOffset(filenum, offset); - LogicOffset sent_l_offset = LogicOffset(item.term_id(), item.logic_id()); + LogicOffset sent_l_offset; + if (g_pika_conf->pika_mode() == PIKA_CLOUD){ + sent_l_offset = LogicOffset(cloud_item.term_id(), 0); + } else { + sent_l_offset = LogicOffset(item.term_id(), item.logic_id()); + } LogOffset sent_offset(sent_b_offset, sent_l_offset); slave_ptr->sync_win.Push(SyncWinItem(sent_offset, msg.size())); @@ -279,6 +293,15 @@ Status SyncMasterDB::GetSafetyPurgeBinlog(std::string* safety_purge) { break; } } +#ifdef USE_S3 + uint32_t oldest_filenum; + s = Logger()->GetOldestBinlogToKeep(&oldest_filenum); + if (!s.ok()) { + LOG(ERROR) << "get oldest binlog to keep failed"; + } + oldest_filenum = oldest_filenum > 0 ? oldest_filenum - 1 : 0; + purge_max = std::min(purge_max, oldest_filenum); +#endif } *safety_purge = (success ? kBinlogPrefix + std::to_string(static_cast(purge_max)) : "none"); return Status::OK(); @@ -770,8 +793,10 @@ Status PikaReplicaManager::CheckDBRole(const std::string& db, int* role) { (sync_master_dbs_[p_info]->GetNumberOfSlaveNode() == 0 && sync_slave_dbs_[p_info]->State() == kNoConnect)) { *role |= PIKA_ROLE_MASTER; + LOG(WARNING) << "role change to PIKA_ROLE_MASTER"; } if (sync_slave_dbs_[p_info]->State() != ReplState::kNoConnect) { + LOG(WARNING) << "role change to PIKA_ROLE_SLAVE"; *role |= PIKA_ROLE_SLAVE; } // if role is not master or slave, the rest situations are all single diff --git a/src/pika_server.cc b/src/pika_server.cc index 57224c5c09..83a452bda6 100644 --- a/src/pika_server.cc +++ b/src/pika_server.cc @@ -3,20 +3,38 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
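One detail from the pika_rm.cc hunk above deserves a standalone illustration: under USE_S3, GetSafetyPurgeBinlog() clamps the purge point with the oldest binlog file the cloud binlog still needs. A minimal sketch of that arithmetic (a hypothetical helper, not part of the patch):

#include <algorithm>
#include <cstdint>

// purge_max: highest binlog file number replication already allows purging.
// oldest_to_keep: value reported by CloudBinlog::GetOldestBinlogToKeep(),
// i.e. the first file some rocksdb instance may still need to replay.
// The patch keeps that file by purging only up to (oldest_to_keep - 1).
uint32_t SafetyPurgeBoundary(uint32_t purge_max, uint32_t oldest_to_keep) {
  uint32_t keep_boundary = oldest_to_keep > 0 ? oldest_to_keep - 1 : 0;
  return std::min(purge_max, keep_boundary);
}

The pika_server.cc changes that follow wire the same cloud options into storage initialization and add the sentinel meta-upload path.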
+#include #include #include +#include #include #include #include #include #include #include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "net/include/net_cli.h" #include "net/include/net_interfaces.h" #include "net/include/net_stats.h" #include "net/include/redis_cli.h" #include "pstd/include/env.h" #include "pstd/include/rsync.h" +#include "pstd/include/pstd_defer.h" #include "pstd/include/pika_codis_slot.h" #include "include/pika_cmd_table_manager.h" @@ -26,6 +44,10 @@ #include "include/pika_rm.h" #include "include/pika_server.h" +using namespace Aws::Http; +using namespace Aws::Utils; +using namespace Aws::Client; + using pstd::Status; extern PikaServer* g_pika_server; extern std::unique_ptr g_pika_rm; @@ -34,6 +56,51 @@ extern std::unique_ptr g_network_statistic; // QUEUE_SIZE_THRESHOLD_PERCENTAGE is used to represent a percentage value and should be within the range of 0 to 100. const size_t QUEUE_SIZE_THRESHOLD_PERCENTAGE = 75; +namespace { +char * base64Encode(const char *buffer, int length, bool newLine) +{ + BIO *bmem = NULL; + BIO *b64 = NULL; + BUF_MEM *bptr; + + b64 = BIO_new(BIO_f_base64()); + if (!newLine) { + BIO_set_flags(b64, BIO_FLAGS_BASE64_NO_NL); + } + bmem = BIO_new(BIO_s_mem()); + b64 = BIO_push(b64, bmem); + BIO_write(b64, buffer, length); + BIO_flush(b64); + BIO_get_mem_ptr(b64, &bptr); + BIO_set_close(b64, BIO_NOCLOSE); + + char *buff = (char *)malloc(bptr->length + 1); + memcpy(buff, bptr->data, bptr->length); + buff[bptr->length] = 0; + BIO_free_all(b64); + + return buff; +} + +char * base64Decode(char *input, int length, bool newLine) +{ + BIO *b64 = NULL; + BIO *bmem = NULL; + char *buffer = (char *)malloc(length); + memset(buffer, 0, length); + b64 = BIO_new(BIO_f_base64()); + if (!newLine) { + BIO_set_flags(b64, BIO_FLAGS_BASE64_NO_NL); + } + bmem = BIO_new_mem_buf(input, length); + bmem = BIO_push(b64, bmem); + BIO_read(bmem, buffer, length); + BIO_free_all(bmem); + + return buffer; +} +} + void DoPurgeDir(void* arg) { std::unique_ptr path(static_cast(arg)); LOG(INFO) << "Delete dir: " << *path << " start"; @@ -88,6 +155,10 @@ PikaServer::PikaServer() // init role std::string slaveof = g_pika_conf->slaveof(); +#ifdef USE_S3 + storage_options_.sst_cache_size_ = g_pika_conf->SSTCacheSize(); + storage_options_.cloud_fs_options.is_master = true; +#endif if (!slaveof.empty()) { auto sep = static_cast(slaveof.find(':')); std::string master_ip = slaveof.substr(0, sep); @@ -96,6 +167,9 @@ PikaServer::PikaServer() LOG(FATAL) << "you will slaveof yourself as the config file, please check"; } else { SetMaster(master_ip, master_port); +#ifdef USE_S3 + storage_options_.cloud_fs_options.is_master = false; +#endif } } @@ -310,11 +384,17 @@ void PikaServer::InitDBStruct() { std::string log_path = g_pika_conf->log_path(); std::vector db_structs = g_pika_conf->db_structs(); std::lock_guard rwl(dbs_rw_); - for (const auto& db : db_structs) { + for (auto& db : db_structs) { std::string name = db.db_name; std::shared_ptr db_ptr = std::make_shared(name, db_path, log_path); db_ptr->Init(); dbs_.emplace(name, db_ptr); + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + db.cloud_endpoint_override = g_pika_conf->cloud_endpoint_override(); + db.cloud_bucket_prefix = g_pika_conf->cloud_src_bucket_prefix(); + db.cloud_bucket_suffix = g_pika_conf->cloud_src_bucket_prefix(); + db.cloud_bucket_region = g_pika_conf->cloud_src_bucket_region(); + } } } @@ -407,6 +487,9 @@ Status 
PikaServer::DoSameThingSpecificDB(const std::set& dbs, const case TaskType::kBgSave: db_item.second->BgSaveDB(); break; + case TaskType::kCloudBgSave: + db_item.second->CloudBgSaveDB(); + break; case TaskType::kCompactRangeStrings: db_item.second->CompactRange(storage::DataType::kStrings, arg.argv[0], arg.argv[1]); break; @@ -516,7 +599,9 @@ Status PikaServer::DoSameThingEveryDB(const TaskType& type) { void PikaServer::BecomeMaster() { std::lock_guard l(state_protector_); + int tmp_role = role_; role_ |= PIKA_ROLE_MASTER; + LOG(WARNING) << "role change from " << tmp_role << " to " << role_; } void PikaServer::DeleteSlave(int fd) { @@ -649,10 +734,13 @@ void PikaServer::SyncError() { void PikaServer::RemoveMaster() { { + int tmp_role = role_; std::lock_guard l(state_protector_); repl_state_ = PIKA_REPL_NO_CONNECT; role_ &= ~PIKA_ROLE_SLAVE; + LOG(WARNING) << "removemaster role change from " << tmp_role << " to " << role_; + if (!master_ip_.empty() && master_port_ != -1) { g_pika_rm->CloseReplClientConn(master_ip_, master_port_ + kPortShiftReplServer); g_pika_rm->LostConnection(master_ip_, master_port_); @@ -670,12 +758,14 @@ bool PikaServer::SetMaster(std::string& master_ip, int master_port) { if (master_ip == "127.0.0.1") { master_ip = host_; } + int tmp_role = role_; std::lock_guard l(state_protector_); if (((role_ ^ PIKA_ROLE_SLAVE) != 0) && repl_state_ == PIKA_REPL_NO_CONNECT) { master_ip_ = master_ip; master_port_ = master_port; role_ |= PIKA_ROLE_SLAVE; repl_state_ = PIKA_REPL_SHOULD_META_SYNC; + LOG(WARNING) << "setmaster role change from " << tmp_role << " to " << role_; return true; } return false; @@ -1311,6 +1401,9 @@ void PikaServer::InitStorageOptions() { storage_options_.options.max_background_jobs = g_pika_conf->max_background_jobs(); storage_options_.options.max_open_files = g_pika_conf->max_cache_files(); storage_options_.options.max_bytes_for_level_multiplier = g_pika_conf->max_bytes_for_level_multiplier(); + storage_options_.options.level0_file_num_compaction_trigger = 2; + storage_options_.options.level0_slowdown_writes_trigger = 8; + storage_options_.options.level0_stop_writes_trigger = 16; storage_options_.options.optimize_filters_for_hits = g_pika_conf->optimize_filters_for_hits(); storage_options_.options.level_compaction_dynamic_level_bytes = g_pika_conf->level_compaction_dynamic_level_bytes(); @@ -1377,6 +1470,23 @@ void PikaServer::InitStorageOptions() { // for column-family options storage_options_.options.ttl = g_pika_conf->rocksdb_ttl_second(); storage_options_.options.periodic_compaction_seconds = g_pika_conf->rocksdb_periodic_compaction_second(); + +#ifdef USE_S3 + // rocksdb-cloud + auto& cloud_fs_opts = storage_options_.cloud_fs_options; + storage_options_.options.max_log_file_size = 0; // TODO: better handles of `assert(cloud_manifest)` + cloud_fs_opts.endpoint_override = g_pika_conf->cloud_endpoint_override(); + cloud_fs_opts.credentials.InitializeSimple(g_pika_conf->cloud_access_key(), g_pika_conf->cloud_secret_key()); + if (!cloud_fs_opts.credentials.HasValid().ok()) { + LOG(FATAL) << "Please set the right aws access key and secret key to access s3"; + } + cloud_fs_opts.src_bucket.SetBucketName(g_pika_conf->cloud_src_bucket_suffix(), g_pika_conf->cloud_src_bucket_prefix()); + cloud_fs_opts.src_bucket.SetRegion(g_pika_conf->cloud_src_bucket_region()); + cloud_fs_opts.dest_bucket.SetBucketName(g_pika_conf->cloud_dest_bucket_suffix(), g_pika_conf->cloud_dest_bucket_prefix()); + cloud_fs_opts.dest_bucket.SetRegion(g_pika_conf->cloud_dest_bucket_region()); + 
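  // Note: rocksdb-cloud's BucketOptions::SetBucketName(bucket, prefix) appears to compose the
  // effective S3 bucket from both strings, so with, say, cloud-src-bucket-prefix = "pika." and
  // cloud-src-bucket-suffix = "store" (illustrative values, not defaults) the source bucket
  // would resolve to "pika.store". The endpoint_override set above lets the AWS SDK target any
  // S3-compatible service (MinIO, for instance) instead of AWS S3 itself.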
//cloud_fs_opts.upload_meta_func = std::bind(&PikaServer::UploadMetaToSentinel, this, + //std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); +#endif } storage::Status PikaServer::RewriteStorageOptions(const storage::OptionType& option_type, @@ -1767,3 +1877,88 @@ void PikaServer::CacheConfigInit(cache::CacheConfig& cache_cfg) { cache_cfg.maxmemory_samples = g_pika_conf->cache_maxmemory_samples(); cache_cfg.lfu_decay_time = g_pika_conf->cache_lfu_decay_time(); } + +#ifdef USE_S3 +bool PikaServer::UploadMetaToSentinel(const std::string& local_path, + const std::string& s3_bucket, + const std::string& remote_path) { + Aws::String url(sentinel_addr_); + if (sentinel_client_ == nullptr) { + sentinel_client_ = CreateHttpClient(Aws::Client::ClientConfiguration()); + } + + FILE* fp = fopen(local_path.c_str(), "rb"); + if (fp == nullptr) { + LOG(WARNING) << "read file failed," + << " local_path: " << local_path + << " error: " << strerror(errno); + return false; + } + + fseek(fp, 0 , SEEK_END); + long f_size = ftell(fp); + rewind(fp); + char* buffer = new char[f_size]; + + DEFER { + delete [] buffer; + fclose(fp); + }; + + size_t result = fread(buffer, 1, f_size, fp); + if (result != f_size) { + LOG(WARNING) << "read file failed, local_path: " << local_path + << " fread size: " << result << "fsize: " << f_size; + } + char* base64_enc = base64Encode(buffer, result, false); + std::string content(base64_enc, strlen(base64_enc)); + LOG(WARNING) << "raw data size: " << result << " encode size: " << strlen(base64_enc) << " enc str: " << base64_enc; + free(base64_enc); + + // construct request body + Json::JsonValue request_doc; + request_doc.WithInteger("term_id", lease_term_id_); + request_doc.WithInteger("group_id", group_id_); + request_doc.WithString("s3_bucket", Aws::String(s3_bucket)); + request_doc.WithString("s3_path", Aws::String(remote_path)); + request_doc.WithString("content", Aws::String(content)); + + std::shared_ptr body = Aws::MakeShared(""); + *body << request_doc.View().WriteReadable(); + + auto request = CreateHttpRequest(url, HttpMethod::HTTP_POST, + Aws::Utils::Stream::DefaultResponseStreamFactoryMethod); + request->AddContentBody(body); + body->seekg(0, body->end); + auto streamSize = body->tellg(); + body->seekg(0, body->beg); + Aws::StringStream contentLength; + contentLength << streamSize; + request->SetContentLength(contentLength.str()); + request->SetContentType("application/json"); + + auto response = sentinel_client_->MakeRequest(request); + if (response->HasClientError()) { + LOG(ERROR) << "UploadMetaToSentinel failed" + << " s3_bucket: " << s3_bucket + << " group_id: " << group_id_ + << " remote path: " << remote_path; + return false; + + } + if (response->GetResponseCode() == HttpResponseCode::OK) { + LOG(ERROR) << "UploadMetaToSentinel success" + << " s3_bucket: " << s3_bucket + << " group_id: " << group_id_ + << " remote path: " << remote_path; + return true; + } + + LOG(ERROR) << "UploadMetaToSentinel failed " + << " s3_bucket: " << s3_bucket + << " group_id: " << group_id_ + << " remote path: " << remote_path; + return false; +} +#endif + diff --git a/src/pika_stable_log.cc b/src/pika_stable_log.cc index ba51d9171c..6be340643a 100644 --- a/src/pika_stable_log.cc +++ b/src/pika_stable_log.cc @@ -8,11 +8,13 @@ #include +#include "include/pika_cloud_binlog.h" +#include "include/pika_cloud_binlog_transverter.h" +#include "include/pika_conf.h" #include "include/pika_rm.h" #include "include/pika_server.h" #include "include/pika_stable_log.h" #include 
"pstd/include/env.h" -#include "include/pika_conf.h" using pstd::Status; @@ -21,7 +23,11 @@ extern std::unique_ptr g_pika_rm; StableLog::StableLog(std::string db_name, std::string log_path) : purging_(false), db_name_(std::move(db_name)), log_path_(std::move(log_path)) { - stable_logger_ = std::make_shared(log_path_, g_pika_conf->binlog_file_size()); + if (g_pika_conf->pika_mode() == PIKA_LOCAL) { + stable_logger_ = std::make_shared(log_path_, g_pika_conf->binlog_file_size()); + } else if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + stable_logger_ = std::make_shared(log_path_, g_pika_conf->binlog_file_size()); + } std::map binlogs; if (!GetBinlogFiles(&binlogs)) { LOG(FATAL) << log_path_ << " Could not get binlog files!"; @@ -171,6 +177,7 @@ void StableLog::UpdateFirstOffset(uint32_t filenum) { BinlogItem item; BinlogOffset offset; + cloud::BinlogCloudItem cloud_item; while (true) { std::string binlog; Status s = binlog_reader.Get(&binlog, &(offset.filenum), &(offset.offset)); @@ -181,20 +188,35 @@ void StableLog::UpdateFirstOffset(uint32_t filenum) { LOG(WARNING) << "Binlog reader get failed"; return; } - if (!PikaBinlogTransverter::BinlogItemWithoutContentDecode(TypeFirst, binlog, &item)) { - LOG(WARNING) << "Binlog item decode failed"; - return; - } - // exec_time == 0, could be padding binlog - if (item.exec_time() != 0) { - break; + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + if (!PikaCloudBinlogTransverter::BinlogItemWithoutContentDecode(binlog, &cloud_item)) { + LOG(WARNING) << "Cloud Binlog item decode failed"; + return; + } + // exec_time == 0, could be padding cloudbinlog + if (cloud_item.exec_time() != 0) { + break; + } + } else { + if (!PikaBinlogTransverter::BinlogItemWithoutContentDecode(TypeFirst, binlog, &item)) { + LOG(WARNING) << "Binlog item decode failed"; + return; + } + // exec_time == 0, could be padding binlog + if (item.exec_time() != 0) { + break; + } } } std::lock_guard l(offset_rwlock_); first_offset_.b_offset = offset; - first_offset_.l_offset.term = item.term_id(); - first_offset_.l_offset.index = item.logic_id(); + if (g_pika_conf->pika_mode() == PIKA_CLOUD) { + first_offset_.l_offset.term = cloud_item.term_id(); + } else { + first_offset_.l_offset.term = item.term_id(); + first_offset_.l_offset.index = item.logic_id(); + } } Status StableLog::PurgeFileAfter(uint32_t filenum) { diff --git a/src/pstd/include/pstd_wal.h b/src/pstd/include/pstd_wal.h new file mode 100644 index 0000000000..9ece9ffa7d --- /dev/null +++ b/src/pstd/include/pstd_wal.h @@ -0,0 +1,22 @@ +// Copyright (c) 2024-present, Qihoo, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef __PSTD_WAL_H__ +#define __PSTD_WAL_H__ + +#include "pstd/include/pstd_status.h" +#include "pstd/include/noncopyable.h" + +namespace pstd { + +// virutal base class for wal writer +class WalWriter : public noncopyable { +public: + virtual ~WalWriter() {} + virtual Status Put(const std::string& item, uint32_t db_id, uint32_t rocksdb_id, uint32_t type) = 0; +}; +} // namespace pstd + +#endif // __PSTD_WAL_H__ diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt index e12cae9b7d..35db197c09 100644 --- a/src/storage/CMakeLists.txt +++ b/src/storage/CMakeLists.txt @@ -4,7 +4,7 @@ set (CMAKE_CXX_STANDARD 17) project (storage) # Other CMake modules -add_subdirectory(tests) +#add_subdirectory(tests) # add_subdirectory(examples) # add_subdirectory(benchmark) diff --git a/src/storage/include/storage/backupable.h b/src/storage/include/storage/backupable.h index e190993c29..d6af6994a2 100644 --- a/src/storage/include/storage/backupable.h +++ b/src/storage/include/storage/backupable.h @@ -11,6 +11,7 @@ #include "rocksdb/db.h" #include "db_checkpoint.h" +#include "include/pika_conf.h" #include "storage.h" #include "util.h" @@ -49,6 +50,9 @@ class BackupEngine { Status CreateNewBackup(const std::string& dir); + Status CreateNewCloudBackup(rocksdb::CloudFileSystemOptions& cloud_fs_options, + PikaConf* pika_conf) ; + void StopBackup(); Status CreateNewBackupSpecify(const std::string& dir, int index); diff --git a/src/storage/include/storage/storage.h b/src/storage/include/storage/storage.h index 296882f510..7fee31d6ee 100644 --- a/src/storage/include/storage/storage.h +++ b/src/storage/include/storage/storage.h @@ -21,6 +21,11 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#ifdef USE_S3 +#include "rocksdb/cloud/db_cloud.h" +#include "rocksdb/cloud/cloud_file_system.h" +#include "pstd/include/pstd_wal.h" +#endif #include "slot_indexer.h" #include "pstd/include/pstd_mutex.h" @@ -70,6 +75,10 @@ struct StorageOptions { size_t statistics_max_size = 0; size_t small_compaction_threshold = 5000; size_t small_compaction_duration_threshold = 10000; +#ifdef USE_S3 + rocksdb::CloudFileSystemOptions cloud_fs_options; // rocksdb-cloud option + int64_t sst_cache_size_ = 10LL << 30; +#endif Status ResetOptions(const OptionType& option_type, const std::unordered_map& options_map); }; @@ -181,7 +190,9 @@ class Storage { Storage(int db_instance_num, int slot_num, bool is_classic_mode); ~Storage(); - Status Open(const StorageOptions& storage_options, const std::string& db_path); + Status Open(const StorageOptions& storage_options, const std::string& db_path, std::shared_ptr wal_writer = nullptr); + + Status FlushDB(); Status LoadCursorStartKey(const DataType& dtype, int64_t cursor, char* type, std::string* start_key); @@ -191,6 +202,12 @@ class Storage { std::unique_ptr& GetDBInstance(const std::string& key); + Status ApplyWAL(int rocksdb_id, int type, const std::string& content, + std::unordered_set* redis_keys); + + bool ShouldSkip(int rocksdb_id, const std::string& content); + Status FlushDBAtSlave(int rocksdb_id); + // Strings Commands // Set key to hold the string value. 
if key @@ -1095,12 +1112,20 @@ class Storage { Status GetKeyNum(std::vector* key_infos); Status StopScanKeyNum(); +#ifdef USE_S3 + rocksdb::DBCloud* GetDBByIndex(int index); + + // called when switch master-slave + Status SwitchMaster(bool is_old_master, bool is_new_master); + +#else rocksdb::DB* GetDBByIndex(int index); +#endif Status SetOptions(const OptionType& option_type, const std::string& db_type, const std::unordered_map& options); void SetCompactRangeOptions(const bool is_canceled); - Status EnableDymayticOptions(const OptionType& option_type, + Status EnableDymayticOptions(const OptionType& option_type, const std::string& db_type, const std::unordered_map& options); Status EnableAutoCompaction(const OptionType& option_type, const std::string& db_type, const std::unordered_map& options); diff --git a/src/storage/include/storage/storage_define.h b/src/storage/include/storage/storage_define.h index 7dbd614169..4e97c20b91 100644 --- a/src/storage/include/storage/storage_define.h +++ b/src/storage/include/storage/storage_define.h @@ -129,5 +129,17 @@ inline const char* SeekUserkeyDelim(const char* ptr, int length) { return ptr; } +#ifdef USE_S3 + // this enum is an extention of ReplicationLogRecord's Type + // reserves kMemtableWrite, kMemtableSwitch, kManifestWrite, + // add kFlushDB which indicates a pika's flushdb call. + enum class RocksDBRecordType : uint32_t { + kMemtableWrite, + kMemtableSwitch, + kManifestWrite, + kFlushDB = 10, // newly add enum, specity pika's flushdb operation + }; +#endif + } // end namespace storage #endif diff --git a/src/storage/src/backupable.cc b/src/storage/src/backupable.cc index 4acd8dee72..9691cf59de 100644 --- a/src/storage/src/backupable.cc +++ b/src/storage/src/backupable.cc @@ -6,9 +6,12 @@ #include #include +#include "include/pika_conf.h" #include "storage/backupable.h" #include "storage/storage.h" +extern const std::string kRegion; + namespace storage { BackupEngine::~BackupEngine() { @@ -141,6 +144,71 @@ Status BackupEngine::CreateNewBackup(const std::string& dir) { return s; } +std::string GenBackUpDirectory(std::string& db_path) { + // dbpath :1."db/"、2."db"、3."bak/db" + size_t lastSepPos = db_path.rfind('/'); + if (lastSepPos != std::string::npos) { + if (db_path.back() == '/') { + db_path.replace(lastSepPos, std::string::npos, "_bak/"); + return db_path; + } else { + return db_path + "_bak/"; + } + } else { + return db_path.append("_bak/"); + } +} + +std::string DBPath(const std::string& path, const std::string& db_name) { + char buf[100]; + snprintf(buf, sizeof(buf), "%s/", db_name.data()); + return path + buf; +} + +Status BackupEngine::CreateNewCloudBackup(rocksdb::CloudFileSystemOptions& cloud_fs_options, + PikaConf* pika_conf) { + Status s = Status::OK(); + + std::string src_bucket = cloud_fs_options.src_bucket.GetBucketName(); + cloud_fs_options.src_bucket.SetBucketName(src_bucket + ".backup"); + cloud_fs_options.dest_bucket.SetBucketName(src_bucket + ".backup"); + std::string db_path_tmp = pika_conf->db_path(); + std::string clone_path = GenBackUpDirectory(db_path_tmp); + + //todo: At present, this operation is not supported online. 
+ // It will be modified according to deployment in the future + for (auto& db_pika : pika_conf->db_structs()) { + std::string db_path = DBPath(pika_conf->db_path(), db_pika.db_name); + std::string clone_db_path = DBPath(clone_path, db_pika.db_name); + for (int i = 0; i < db_pika.db_instance_num; i++) { + rocksdb::CloudFileSystem* cfs; + s = rocksdb::CloudFileSystem::NewAwsFileSystem( + rocksdb::FileSystem::Default(), src_bucket, + db_path + std::to_string(i), kRegion, + src_bucket + ".backup",clone_db_path + std::to_string(i), + kRegion, cloud_fs_options, nullptr, &cfs); + + if (!s.ok()) { + return s; + } + std::shared_ptr fs(cfs); + auto cloud_env = NewCompositeEnv(fs); + Options options; + options.env = cloud_env.get(); + // open clone + rocksdb::DBCloud* db; + s = rocksdb::DBCloud::Open(options, clone_db_path + std::to_string(i), + "", 0, &db); + if (!s.ok()) { + return s; + } + db->Savepoint(); + } + } + return s; +} + + void BackupEngine::StopBackup() { // DEPRECATED } diff --git a/src/storage/src/base_filter.h b/src/storage/src/base_filter.h index 093f3f4761..3df7d7e3d8 100644 --- a/src/storage/src/base_filter.h +++ b/src/storage/src/base_filter.h @@ -15,6 +15,11 @@ #include "src/base_data_key_format.h" #include "src/base_meta_value_format.h" #include "src/debug.h" +#ifdef USE_S3 +#include "rocksdb/cloud/db_cloud.h" +#else +#include "rocksdb/db.h" +#endif namespace storage { @@ -60,7 +65,11 @@ class BaseMetaFilterFactory : public rocksdb::CompactionFilterFactory { class BaseDataFilter : public rocksdb::CompactionFilter { public: +#ifdef USE_S3 + BaseDataFilter(rocksdb::DBCloud* db, std::vector* cf_handles_ptr, int meta_cf_index) +#else BaseDataFilter(rocksdb::DB* db, std::vector* cf_handles_ptr, int meta_cf_index) +#endif : db_(db), cf_handles_ptr_(cf_handles_ptr), meta_cf_index_(meta_cf_index) @@ -146,7 +155,11 @@ class BaseDataFilter : public rocksdb::CompactionFilter { const char* Name() const override { return "BaseDataFilter"; } private: +#ifdef USE_S3 + rocksdb::DBCloud* db_ = nullptr; +#else rocksdb::DB* db_ = nullptr; +#endif std::vector* cf_handles_ptr_ = nullptr; rocksdb::ReadOptions default_read_options_; mutable std::string cur_key_; @@ -158,7 +171,11 @@ class BaseDataFilter : public rocksdb::CompactionFilter { class BaseDataFilterFactory : public rocksdb::CompactionFilterFactory { public: +#ifdef USE_S3 + BaseDataFilterFactory(rocksdb::DBCloud** db_ptr, std::vector* handles_ptr, int meta_cf_index) +#else BaseDataFilterFactory(rocksdb::DB** db_ptr, std::vector* handles_ptr, int meta_cf_index) +#endif : db_ptr_(db_ptr), cf_handles_ptr_(handles_ptr), meta_cf_index_(meta_cf_index) {} std::unique_ptr CreateCompactionFilter( const rocksdb::CompactionFilter::Context& context) override { @@ -167,7 +184,11 @@ class BaseDataFilterFactory : public rocksdb::CompactionFilterFactory { const char* Name() const override { return "BaseDataFilterFactory"; } private: +#ifdef USE_S3 + rocksdb::DBCloud** db_ptr_ = nullptr; +#else rocksdb::DB** db_ptr_ = nullptr; +#endif std::vector* cf_handles_ptr_ = nullptr; int meta_cf_index_ = 0; }; diff --git a/src/storage/src/lists_filter.h b/src/storage/src/lists_filter.h index b31b01c441..1e6ce8a607 100644 --- a/src/storage/src/lists_filter.h +++ b/src/storage/src/lists_filter.h @@ -10,9 +10,7 @@ #include #include -#include "rocksdb/compaction_filter.h" -#include "rocksdb/db.h" -#include "src/debug.h" +#include "base_filter.h" #include "src/lists_data_key_format.h" #include "src/lists_meta_value_format.h" @@ -60,7 +58,11 @@ class 
ListsMetaFilterFactory : public rocksdb::CompactionFilterFactory { class ListsDataFilter : public rocksdb::CompactionFilter { public: +#ifdef USE_S3 + ListsDataFilter(rocksdb::DBCloud* db, std::vector* cf_handles_ptr, int meta_cf_index) +#else ListsDataFilter(rocksdb::DB* db, std::vector* cf_handles_ptr, int meta_cf_index) +#endif : db_(db), cf_handles_ptr_(cf_handles_ptr), meta_cf_index_(meta_cf_index) @@ -145,7 +147,11 @@ class ListsDataFilter : public rocksdb::CompactionFilter { const char* Name() const override { return "ListsDataFilter"; } private: +#ifdef USE_S3 + rocksdb::DBCloud* db_ = nullptr; +#else rocksdb::DB* db_ = nullptr; +#endif std::vector* cf_handles_ptr_ = nullptr; rocksdb::ReadOptions default_read_options_; mutable std::string cur_key_; @@ -157,7 +163,11 @@ class ListsDataFilter : public rocksdb::CompactionFilter { class ListsDataFilterFactory : public rocksdb::CompactionFilterFactory { public: +#ifdef USE_S3 + ListsDataFilterFactory(rocksdb::DBCloud** db_ptr, std::vector* handles_ptr, int meta_cf_index) +#else ListsDataFilterFactory(rocksdb::DB** db_ptr, std::vector* handles_ptr, int meta_cf_index) +#endif : db_ptr_(db_ptr), cf_handles_ptr_(handles_ptr), meta_cf_index_(meta_cf_index) {} std::unique_ptr CreateCompactionFilter( @@ -167,7 +177,11 @@ class ListsDataFilterFactory : public rocksdb::CompactionFilterFactory { const char* Name() const override { return "ListsDataFilterFactory"; } private: +#ifdef USE_S3 + rocksdb::DBCloud** db_ptr_ = nullptr; +#else rocksdb::DB** db_ptr_ = nullptr; +#endif std::vector* cf_handles_ptr_ = nullptr; int meta_cf_index_ = 0; }; diff --git a/src/storage/src/redis.cc b/src/storage/src/redis.cc index 94c85ecbca..bf64ef9aff 100644 --- a/src/storage/src/redis.cc +++ b/src/storage/src/redis.cc @@ -6,13 +6,19 @@ #include #include "rocksdb/env.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "cloud/filename.h" #include "src/redis.h" +#include "rocksdb/options.h" #include "src/strings_filter.h" #include "src/lists_filter.h" #include "src/base_filter.h" #include "src/zsets_filter.h" +#include "pstd/include/pstd_defer.h" + namespace storage { const rocksdb::Comparator* ListsDataKeyComparator() { static ListsDataKeyComparatorImpl ldkc; @@ -24,23 +30,32 @@ rocksdb::Comparator* ZSetsScoreKeyComparator() { return &zsets_score_key_compare; } -Redis::Redis(Storage* const s, int32_t index) +Redis::Redis(Storage* const s, int32_t index, std::shared_ptr wal_writer) : storage_(s), index_(index), lock_mgr_(std::make_shared(1000, 0, std::make_shared())), small_compaction_threshold_(5000), - small_compaction_duration_threshold_(10000) { + small_compaction_duration_threshold_(10000), + wal_writer_(wal_writer) { statistics_store_ = std::make_unique>(); scan_cursors_store_ = std::make_unique>(); spop_counts_store_ = std::make_unique>(); default_compact_range_options_.exclusive_manual_compaction = false; default_compact_range_options_.change_level = true; + default_write_options_.disableWAL = true; spop_counts_store_->SetCapacity(1000); scan_cursors_store_->SetCapacity(5000); //env_ = rocksdb::Env::Instance(); +#ifdef USE_S3 + log_listener_ = std::make_shared(index_, this, wal_writer); +#endif handles_.clear(); } Redis::~Redis() { + Close(); +} + +void Redis::Close() { rocksdb::CancelAllBackgroundWork(db_, true); std::vector tmp_handles = handles_; handles_.clear(); @@ -53,17 +68,82 @@ Redis::~Redis() { if (default_compact_range_options_.canceled) { delete default_compact_range_options_.canceled; } +#ifdef USE_S3 + opened_ = false; 
+#endif } -Status Redis::Open(const StorageOptions& storage_options, const std::string& db_path) { +Status Redis::FlushDBAtSlave() { + Close(); + pstd::DeleteDir(db_path_); + return Open(storage_options_, db_path_); +} + +Status Redis::FlushDB() { + rocksdb::CancelAllBackgroundWork(db_, true); + std::string s3_bucket = storage_options_.cloud_fs_options.dest_bucket.GetBucketName(); + std::string local_dbid; + auto s = ReadFileToString(cfs_->GetBaseFileSystem().get(), rocksdb::IdentityFileName(db_path_), &local_dbid); + LOG(INFO) << "local_dbid: " << local_dbid << " status: " << s.ToString(); + if (!s.ok()) { + return s; + } + s = cfs_->DeleteDbid(s3_bucket, local_dbid); + LOG(INFO) << " deletedbid status: " << s.ToString(); + if (!s.ok()) { + return s; + } + s = cfs_->DeleteCloudObject(s3_bucket, MakeCloudManifestFile(db_path_, "")); + LOG(INFO) << "deletecloudmanifestfromdest tatus: " << s.ToString(); + if (!s.ok()) { + return s; + } + s = cfs_->DeleteCloudObject(s3_bucket, rocksdb::IdentityFileName(db_path_)); + LOG(INFO) << "deleteidentityfile status: " << s.ToString(); + if (!s.ok()) { + return s; + } + cfs_->SwitchMaster(false); + Close(); + pstd::DeleteDir(db_path_); + Open(storage_options_, db_path_); + wal_writer_->Put("flushdb", 0/*db_id*/, index_, static_cast(RocksDBRecordType::kFlushDB)); + return s; +} + +Status Redis::Open(const StorageOptions& tmp_storage_options, const std::string& db_path) { + + StorageOptions storage_options(tmp_storage_options); +#ifdef USE_S3 + db_path_ = db_path; + storage_options_ = tmp_storage_options; + storage_options_.cloud_fs_options.dest_bucket.SetObjectPath(db_path_); + storage_options_.cloud_fs_options.src_bucket.SetObjectPath(db_path_); + storage_options.cloud_fs_options.roll_cloud_manifest_on_open = true; + storage_options.cloud_fs_options.resync_on_open = true; + storage_options.cloud_fs_options.resync_manifest_on_open = true; + storage_options.cloud_fs_options.skip_dbid_verification = true; + storage_options.cloud_fs_options.sst_file_cache = rocksdb::NewLRUCache(storage_options_.sst_cache_size_, 0/*num_shard_bits*/); + storage_options.options.replication_log_listener = log_listener_; + + is_master_.store(tmp_storage_options.cloud_fs_options.is_master); + if (!tmp_storage_options.cloud_fs_options.is_master) { + storage_options.options.disable_auto_flush = true; + storage_options.options.disable_auto_compactions = true; + } + storage_options.options.atomic_flush = true; + storage_options.options.avoid_flush_during_shutdown = true; +#endif + statistics_store_->SetCapacity(storage_options.statistics_max_size); small_compaction_threshold_ = storage_options.small_compaction_threshold; rocksdb::BlockBasedTableOptions table_ops(storage_options.table_options); table_ops.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); - rocksdb::DBOptions db_ops(storage_options.options); + rocksdb::Options db_ops(storage_options.options); db_ops.create_missing_column_families = true; + db_ops.listeners.emplace_back(new RocksDBEventListener(index_)); // db_ops.env = env_; // string column-family options @@ -175,7 +255,25 @@ Status Redis::Open(const StorageOptions& storage_options, const std::string& db_ // stream CF column_families.emplace_back("stream_meta_cf", stream_meta_cf_ops); column_families.emplace_back("stream_data_cf", stream_data_cf_ops); - return rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + +#ifdef USE_S3 + Status s = OpenCloudEnv(storage_options.cloud_fs_options, db_path); + if (!s.ok()) { + LOG(ERROR) << "Failed to 
create AWS S3 cloud environment"; + return s; + } + db_ops.env = cloud_env_.get(); + s = rocksdb::DBCloud::Open(db_ops, db_path, column_families, "", 0, &handles_, &db_); + if (s.ok()) { + opened_ = true; + } + return s; + +#else + auto s = rocksdb::DB::Open(db_ops, db_path, column_families, &handles_, &db_); + opened_ = true; + return s; +#endif } Status Redis::GetScanStartPoint(const DataType& type, const Slice& key, const Slice& pattern, int64_t cursor, std::string* start_point) { @@ -414,7 +512,7 @@ void Redis::SetCompactRangeOptions(const bool is_canceled) { default_compact_range_options_.canceled = new std::atomic(is_canceled); } else { default_compact_range_options_.canceled->store(is_canceled); - } + } } Status Redis::GetProperty(const std::string& property, uint64_t* out) { @@ -465,4 +563,220 @@ void Redis::ScanDatabase() { ScanSets(); } +#ifdef USE_S3 +Status Redis::OpenCloudEnv(rocksdb::CloudFileSystemOptions opts, const std::string& db_path) { + std::string s3_path = db_path[0] == '.' ? db_path.substr(1) : db_path; + opts.src_bucket.SetObjectPath(s3_path); + opts.dest_bucket.SetObjectPath(s3_path); + Status s = rocksdb::CloudFileSystem::NewAwsFileSystem( + rocksdb::FileSystem::Default(), + opts, + nullptr, + &cfs_ + ); + if (s.ok()) { + std::shared_ptr cloud_fs(cfs_); + cloud_env_ = NewCompositeEnv(cloud_fs); + } + return s; +} + +Status Redis::ReOpenRocksDB(const storage::StorageOptions& opt) { + Close(); + Open(opt, db_path_); + return Status::OK(); +} + +Status Redis::SwitchMaster(bool is_old_master, bool is_new_master) { + LOG(WARNING) << "switchMaster from " << (is_old_master ? "master" : "slave") + << " to " << (is_new_master ? "master" : "slave"); + if (is_old_master && is_new_master) { + // Do nothing + return Status::OK(); + } + + storage::StorageOptions storage_options(storage_options_); + std::unordered_map db_options; + if (is_old_master && !is_new_master) { + cfs_->SwitchMaster(false); + storage_options.cloud_fs_options.is_master = false; + is_master_.store(false); + return ReOpenRocksDB(storage_options); + } + + // slaveof another pika master, just reopen + if (!is_old_master && !is_new_master) { + storage_options.cloud_fs_options.is_master = false; + is_master_.store(false); + return ReOpenRocksDB(storage_options); + } + + // slave promotes to master + if (!is_old_master && is_new_master) { + storage_options.cloud_fs_options.is_master = true; + db_options["disable_auto_compactions"] = "false"; + db_options["disable_auto_flush"] = "false"; + // compare manifest_sequence + uint64_t local_manifest_sequence = 0; + auto s = db_->GetManifestUpdateSequence(&local_manifest_sequence); + if (!s.ok()) { + LOG(ERROR) << "get manifestupdatesequence error: " << s.ToString(); + } + uint64_t remote_manifest_sequence = 0; + cfs_->GetMaxManifestSequenceFromCurrentManifest(db_->GetName(), &remote_manifest_sequence); + // local version behind remote, directly reopen + if (local_manifest_sequence < remote_manifest_sequence) { + return ReOpenRocksDB(storage_options); + } + // local's version cannot beyond remote's, just holding extra data in memtables + assert(local_manifest_sequence == remote_manifest_sequence); + storage_options_.cloud_fs_options.is_master = true; + is_master_.store(true); + + db_->NewManifestOnNextUpdate(); + cfs_->SwitchMaster(true); + for (const auto& cf : handles_) { + db_->SetOptions(cf, db_options); + } + + rocksdb::FlushOptions fops; + fops.wait = true; + db_->Flush(fops, handles_); + return Status::OK(); + } + return Status::OK(); +} + +bool 
Redis::ShouldSkip(const std::string& content) { + rocksdb::WriteBatch batch; + auto s = rocksdb::WriteBatchInternal::SetContents(&batch, content); + auto sq_number = db_->GetLatestSequenceNumber(); + return rocksdb::WriteBatchInternal::Sequence(&batch) != sq_number + 1; +} + +class WriteBatchHandler : public rocksdb::WriteBatch::Handler { +public: + WriteBatchHandler(std::unordered_set* redis_keys) + : redis_keys_(redis_keys) {} + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + return DeleteCF(column_family_id, key); + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + switch (column_family_id) { + case kStringsCF: { + ParsedBaseKey pbk(key); + redis_keys_->insert("K" + pbk.Key().ToString()); + break; + } + case kHashesMetaCF: { + ParsedBaseMetaKey pbk(key); + redis_keys_->insert("H" + pbk.Key().ToString()); + break; + } + case kHashesDataCF: { + ParsedHashesDataKey pbk(key); + redis_keys_->insert("H" + pbk.Key().ToString()); + break; + } + case kSetsMetaCF: { + ParsedBaseMetaKey pbk(key); + redis_keys_->insert("S" + pbk.Key().ToString()); + break; + } + case kSetsDataCF: { + ParsedSetsMemberKey pbk(key); + redis_keys_->insert("S" + pbk.Key().ToString()); + break; + } + case kListsMetaCF: { + ParsedBaseMetaKey pbk(key); + redis_keys_->insert("L" + pbk.Key().ToString()); + break; + } + case kListsDataCF: { + ParsedListsDataKey pbk(key); + redis_keys_->insert("L" + pbk.key().ToString()); + break; + } + case kZsetsMetaCF: { + ParsedBaseMetaKey pbk(key); + redis_keys_->insert("Z" + pbk.Key().ToString()); + break; + } + case kZsetsDataCF: { + ParsedZSetsMemberKey pbk(key); + redis_keys_->insert("Z" + pbk.Key().ToString()); + break; + } + case kZsetsScoreCF: { + ParsedZSetsScoreKey pbk(key); + redis_keys_->insert("Z" + pbk.key().ToString()); + break; + } + case kStreamsMetaCF: { + LOG(INFO) << "rediscache don't cache stream type"; + break; + } + case kStreamsDataCF: { + LOG(INFO) << "rediscache don't cache stream type"; + break; + } + } + return Status::OK(); + } +private: + std::unordered_set* redis_keys_ = nullptr; +}; + +Status Redis::ApplyWAL(int type, const std::string& content, + std::unordered_set* redis_keys) { + rocksdb::ReplicationLogRecord::Type rtype = static_cast(type); + rocksdb::ReplicationLogRecord rlr; + rocksdb::DBCloud::ApplyReplicationLogRecordInfo info; + rlr.contents = content; + rlr.type = rtype; + + auto s = db_->ApplyReplicationLogRecord(rlr, "", nullptr, true, &info, rocksdb::DB::AR_EVICT_OBSOLETE_FILES); + if (!s.ok()) { + return s; + } + if (type != 0) { + return s; + } + + rocksdb::WriteBatch batch; + s = rocksdb::WriteBatchInternal::SetContents(&batch, content); + WriteBatchHandler handler(redis_keys); + s = batch.Iterate(&handler); + return s; +} + +std::string LogListener::OnReplicationLogRecord(rocksdb::ReplicationLogRecord record) { + Redis* redis_inst = (Redis*)inst_; + //TODO(wangshaoyi): get from storage + int db_id = 0; + if (!redis_inst->opened_) { + LOG(WARNING) << "rocksdb not opened yet, skip write binlog"; + return "0"; + } + + if (!redis_inst->IsMaster()) { + return "0"; + } + if (record.type != rocksdb::ReplicationLogRecord::kMemtableWrite) { + redis_inst->cfs_->WaitPendingObjects(); + } + + auto s = wal_writer_->Put(record.contents, db_id, + redis_inst->GetIndex(), record.type); + if (!s.ok()) { + LOG(ERROR) << "write binlog failed, db_id: " << db_id + << " rocksdb_id: " << redis_inst->GetIndex(); + } + return ""; +} +#endif } // namespace storage diff --git 
a/src/storage/src/redis.h b/src/storage/src/redis.h index 2e28743aae..43f2ec6a45 100644 --- a/src/storage/src/redis.h +++ b/src/storage/src/redis.h @@ -10,7 +10,13 @@ #include #include +#ifdef USE_S3 +#include "rocksdb/cloud/db_cloud.h" +#include "pstd/include/pstd_wal.h" +#else #include "rocksdb/db.h" +#include "rocksdb/listener.h" +#endif #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -29,16 +35,34 @@ #define SPOP_COMPACT_THRESHOLD_COUNT 500 #define SPOP_COMPACT_THRESHOLD_DURATION (1000 * 1000) // 1000ms +static inline std::string StallEnumToString(rocksdb::WriteStallCondition cond) { + switch (cond) { + case rocksdb::WriteStallCondition::kDelayed: + return "delayed"; + case rocksdb::WriteStallCondition::kStopped: + return "stopped"; + case rocksdb::WriteStallCondition::kNormal: + return "normal"; + } +} + namespace storage { using Status = rocksdb::Status; using Slice = rocksdb::Slice; +class LogListener; class Redis { public: - Redis(Storage* storage, int32_t index); + friend class LogListener; + Redis(Storage* storage, int32_t index, std::shared_ptr wal_writer = nullptr); virtual ~Redis(); +#ifdef USE_S3 + rocksdb::DBCloud* GetDB() { return db_; } + bool IsMaster() const { return is_master_.load(); } +#else rocksdb::DB* GetDB() { return db_; } +#endif struct KeyStatistics { size_t window_size; @@ -103,6 +127,8 @@ class Redis { // Common Commands Status Open(const StorageOptions& storage_options, const std::string& db_path); + void Close(); + Status FlushDB(); virtual Status CompactRange(const DataType& option_type, const rocksdb::Slice* begin, const rocksdb::Slice* end, const ColumnFamilyType& type = kMetaAndData); @@ -383,6 +409,17 @@ class Redis { return nullptr; } +#ifdef USE_S3 + Status ApplyWAL(int type, const std::string& content, + std::unordered_set* redis_keys); + bool ShouldSkip(const std::string& content); + Status FlushDBAtSlave(); + Status SwitchMaster(bool is_old_master, bool is_new_master); + void ResetLogListener(std::shared_ptr handle) { + log_listener_ = handle; + } +#endif + private: Status GenerateStreamID(const StreamMetaValue& stream_meta, StreamAddTrimArgs& args); @@ -415,12 +452,23 @@ class Redis { inline Status SetFirstOrLastID(const rocksdb::Slice& key, StreamMetaValue& stream_meta, bool is_set_first, rocksdb::ReadOptions& read_options); +public: + bool opened_ = false; private: int32_t index_ = 0; Storage* const storage_; std::shared_ptr lock_mgr_; +#ifdef USE_S3 + std::string db_path_; + rocksdb::DBCloud* db_ = nullptr; + std::shared_ptr log_listener_; + std::shared_ptr wal_writer_; + StorageOptions storage_options_; + std::atomic is_master_ = {true}; +#else rocksdb::DB* db_ = nullptr; +#endif //TODO(wangshaoyi): seperate env for each rocksdb instance // rocksdb::Env* env_ = nullptr; @@ -444,6 +492,66 @@ class Redis { Status UpdateSpecificKeyStatistics(const DataType& dtype, const std::string& key, uint64_t count); Status UpdateSpecificKeyDuration(const DataType& dtype, const std::string& key, uint64_t duration); Status AddCompactKeyTaskIfNeeded(const DataType& dtype, const std::string& key, uint64_t count, uint64_t duration); + +#ifdef USE_S3 + // rocksdb-cloud + Status OpenCloudEnv(rocksdb::CloudFileSystemOptions opts, const std::string& db_path); + std::unique_ptr cloud_env_; + rocksdb::CloudFileSystem* cfs_; + Status ReOpenRocksDB(const storage::StorageOptions& opt); +#endif +}; + +// TODO(wangshaoyi): implement details +class LogListener : public rocksdb::ReplicationLogListener { +public: + LogListener(int rocksdb_id, void* inst, 
std::shared_ptr wal_writer) + : rocksdb_id_(rocksdb_id), counter_(0), + inst_(inst), wal_writer_(wal_writer) {} + std::string OnReplicationLogRecord(rocksdb::ReplicationLogRecord record) override; + + // reset when switch master or process start + void ResetSequence(uint64_t seq) { + counter_.store(seq); + } +private: + int rocksdb_id_ = 0; + std::atomic counter_ = {0}; + void* inst_ = nullptr; + std::shared_ptr wal_writer_ = nullptr; +}; + +class RocksDBEventListener : public rocksdb::EventListener { +public: + RocksDBEventListener(int index) : index_(index) {} + ~RocksDBEventListener() {} + virtual void OnStallConditionsChanged(const rocksdb::WriteStallInfo& info) override { + LOG(INFO) << "stall condition changed, rocksdb id: " << index_ + << "column_family name: " << info.cf_name + << " change from stall condition: " << StallEnumToString(info.condition.prev) + << " to stall condition: " << StallEnumToString(info.condition.cur); + } + void OnCompactionCompleted(rocksdb::DB* /*db*/, const rocksdb::CompactionJobInfo& info) override { + LOG(INFO) << "compaction completed, rocksdb id: " << index_ + << " column_family name: " << info.cf_name + << " thread_id: " << info.thread_id + << " job_id: " << info.job_id + << " input level: " << info.base_input_level + << " output level: " << info.output_level + << " elapsed time: " << info.stats.elapsed_micros / 1000 << " ms" + << " total_input_bytes: " << (info.stats.total_input_bytes >> 20) << " MB"; + } + void OnFlushCompleted(rocksdb::DB* /*db*/, + const rocksdb::FlushJobInfo& info) override { + LOG(INFO) << "flush completed, rocksdb id: " << index_ + << " column_family name: " << info.cf_name + << " thread_id: " << info.thread_id + << " job_id: " << info.job_id + << " triggered_writes_slowdown: " << (info.triggered_writes_slowdown ? "true" : "false") + << " triggered_writes_stop: " << (info.triggered_writes_stop ? 
"true" : "false"); + } +private: + int index_ = 0; }; } // namespace storage diff --git a/src/storage/src/storage.cc b/src/storage/src/storage.cc index 9fb252a7d0..ef9e370efc 100644 --- a/src/storage/src/storage.cc +++ b/src/storage/src/storage.cc @@ -89,12 +89,16 @@ static std::string AppendSubDirectory(const std::string& db_path, int index) { } } -Status Storage::Open(const StorageOptions& storage_options, const std::string& db_path) { +Status Storage::Open(const StorageOptions& storage_options, const std::string& db_path, std::shared_ptr wal_writer) { mkpath(db_path.c_str(), 0755); int inst_count = db_instance_num_; for (int index = 0; index < inst_count; index++) { +#ifdef USE_S3 + insts_.emplace_back(std::make_unique(this, index, wal_writer)); +#else insts_.emplace_back(std::make_unique(this, index)); +#endif Status s = insts_.back()->Open(storage_options, AppendSubDirectory(db_path, index)); if (!s.ok()) { LOG(FATAL) << "open db failed" << s.ToString(); @@ -105,6 +109,16 @@ Status Storage::Open(const StorageOptions& storage_options, const std::string& d return Status::OK(); } +Status Storage::FlushDB() { + for (int index = 0; index < db_instance_num_; index++) { + auto s = insts_[index]->FlushDB(); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + Status Storage::LoadCursorStartKey(const DataType& dtype, int64_t cursor, char* type, std::string* start_key) { std::string index_key = DataTypeTag[dtype] + std::to_string(cursor); std::string index_value; @@ -2315,7 +2329,12 @@ Status Storage::StopScanKeyNum() { return Status::OK(); } + +#ifdef USE_S3 +rocksdb::DBCloud* Storage::GetDBByIndex(int index) { +#else rocksdb::DB* Storage::GetDBByIndex(int index) { +#endif if (index < 0 || index >= db_instance_num_) { LOG(WARNING) << "Invalid DB Index: " << index << "total: " << db_instance_num_; @@ -2397,7 +2416,7 @@ Status Storage::EnableAutoCompaction(const OptionType& option_type, void Storage::GetRocksDBInfo(std::string& info) { char temp[12] = {0}; for (const auto& inst : insts_) { - snprintf(temp, sizeof(temp), "instance:%2d", inst->GetIndex()); + snprintf(temp, sizeof(temp), "instance%2d", inst->GetIndex()); inst->GetRocksDBInfo(info, temp); } } @@ -2444,4 +2463,38 @@ void Storage::DisableWal(const bool is_wal_disable) { } } +#ifdef USE_S3 +Status Storage::SwitchMaster(bool is_old_master, bool is_new_master) { + Status s = Status::OK(); + for (const auto& inst : insts_) { + s = inst->SwitchMaster(is_old_master, is_new_master); + if (!s.ok()) { + LOG(WARNING) << "switch mode failed, when switch from " + << (is_old_master ? "master" : "slave") << " to " + << (is_new_master ? 
"master" : "slave"); + return s; + } + } + return s; +} + +Status Storage::ApplyWAL(int rocksdb_id, + int type, const std::string& content, + std::unordered_set* redis_keys) { + auto& inst = insts_[rocksdb_id]; + return inst->ApplyWAL(type, content, redis_keys); +} + + +bool Storage::ShouldSkip(int rocksdb_id, const std::string& content) { + auto& inst = insts_[rocksdb_id]; + return inst->ShouldSkip(content); +} + +Status Storage::FlushDBAtSlave(int rocksdb_id) { + auto& inst = insts_[rocksdb_id]; + return inst->FlushDBAtSlave(); +} +#endif + } // namespace storage diff --git a/src/storage/src/zsets_filter.h b/src/storage/src/zsets_filter.h index 8de0e6612b..e28f2d4f5e 100644 --- a/src/storage/src/zsets_filter.h +++ b/src/storage/src/zsets_filter.h @@ -115,7 +115,11 @@ class ZSetsScoreFilter : public rocksdb::CompactionFilter { class ZSetsScoreFilterFactory : public rocksdb::CompactionFilterFactory { public: +#ifdef USE_S3 + ZSetsScoreFilterFactory(rocksdb::DBCloud** db_ptr, std::vector* handles_ptr, int meta_cf_index) +#else ZSetsScoreFilterFactory(rocksdb::DB** db_ptr, std::vector* handles_ptr, int meta_cf_index) +#endif : db_ptr_(db_ptr), cf_handles_ptr_(handles_ptr), meta_cf_index_(meta_cf_index) {} std::unique_ptr CreateCompactionFilter( @@ -126,7 +130,11 @@ class ZSetsScoreFilterFactory : public rocksdb::CompactionFilterFactory { const char* Name() const override { return "ZSetsScoreFilterFactory"; } private: +#ifdef USE_S3 + rocksdb::DBCloud** db_ptr_ = nullptr; +#else rocksdb::DB** db_ptr_ = nullptr; +#endif std::vector* cf_handles_ptr_ = nullptr; int meta_cf_index_ = 0; }; diff --git a/src/storage/tests/CMakeLists.txt b/src/storage/tests/CMakeLists.txt index 09dc7f32cc..063f9b8794 100644 --- a/src/storage/tests/CMakeLists.txt +++ b/src/storage/tests/CMakeLists.txt @@ -33,6 +33,11 @@ foreach(blackwindow_test_source ${BLACKWINDOW_TEST_SOURCE}) PUBLIC ${GFLAGS_LIBRARY} PUBLIC ${LIBUNWIND_LIBRARY} ) + if (USE_S3) + find_package(AWSSDK REQUIRED COMPONENTS s3 transfer kinesis) + target_link_libraries(${blackwindow_test_name} PUBLIC ${AWSSDK_LINK_LIBRARIES}) + endif() + add_test(NAME ${blackwindow_test_name} COMMAND ${blackwindow_test_name} WORKING_DIRECTORY .) 
diff --git a/src/storage/tests/cloud_clone_test.cc b/src/storage/tests/cloud_clone_test.cc new file mode 100644 index 0000000000..d1438413f0 --- /dev/null +++ b/src/storage/tests/cloud_clone_test.cc @@ -0,0 +1,555 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" + +#include "pstd/include/env.h" +#include "pstd_defer.h" +#include "rocksdb/cloud/db_cloud.h" +#include "src/redis.h" +#include "storage/storage.h" +#include "storage/util.h" + +using namespace storage; +using namespace rocksdb; + + +class CloudTest : public ::testing::Test { +public: + CloudTest() = default; + ~CloudTest() override = default; + + void SetUp() override { + storage_options.options.create_if_missing = true; + storage_options.options.avoid_flush_during_shutdown = true; + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://10.224.129.40:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; + } + + void TearDown() override { + } + + static void SetUpTestSuite() {} + static void TearDownTestSuite() {} + + StorageOptions storage_options; + storage::Status s; + std::string path; +}; + +// This is the local directory where the db is stored. The same +// path name is used to store data inside the specified cloud +// storage bucket. +std::string kDBPath = "db"; + +// This is the local directory where the clone is stored. The same +// pathname is used to store data in the specified cloud bucket. +//std::string kClonePath = "db"; +std::string kClonePath = "clone"; +std::string kBucketSuffix = "cloud.clone.example."; +std::string kBucketSuffix2_src = "cloud2.clone.example."; +std::string kBucketSuffix2_dest = "cloud2.clone.example.dst."; +// +// This is the name of the cloud storage bucket where the db +// is made durable. If you are using AWS, you have to manually +// ensure that this bucket name is unique to you and does not +// conflict with any other S3 users who might have already created +// this bucket name. +// In this example, the database and its clone are both stored in +// the same bucket (obviously with different pathnames). +// + +std::string kRegion = "us-east-1"; + +TEST_F(CloudTest, test_360_s3) { + // cloud environment config options here + CloudFileSystemOptions cloud_fs_options; + + cloud_fs_options.endpoint_override = "beijing2.xstore.qihoo.net"; + cloud_fs_options.credentials.InitializeSimple("YHDIJ1LCITN7YHLETHLW", "fR5b2hEOzeogmiR01FzvYpb9BNt8eSrt0crHy510"); + if (!cloud_fs_options.credentials.HasValid().ok()) { + fprintf( + stderr, + "Please set env variables " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with cloud credentials"); + return; + } + + std::string bucketName = "xx"; + cloud_fs_options.src_bucket.SetBucketName("pulsar-s3-test-beijing2"); + cloud_fs_options.dest_bucket.SetBucketName("pulsar-s3-test-beijing2"); + // Create a new AWS cloud env Status + CloudFileSystem* cfs; + Status s = CloudFileSystem::NewAwsFileSystem( + FileSystem::Default(), "pulsar-s3-test-beijing2", kDBPath, kRegion, "pulsar-s3-test-beijing2", + kDBPath, kRegion, cloud_fs_options, nullptr, &cfs); + if (!s.ok()) { + fprintf(stderr, "Unable to create cloud env in bucket %s. 
%s\n", + bucketName.c_str(), s.ToString().c_str()); + return; + } + + + // Store a reference to a cloud env. A new cloud env object should be + // associated with every new cloud-db. + auto cloud_env = NewCompositeEnv(std::shared_ptr(cfs)); + + // Create options and use the AWS env that we created earlier + Options options; + options.env = cloud_env.get(); + options.create_if_missing = true; + + // No persistent cache + std::string persistent_cache = ""; + + // Create and Open DB + DBCloud* db = nullptr; + s = DBCloud::Open(options, kDBPath, persistent_cache, 0, &db); + + if (!s.ok()) { + fprintf(stderr, "Unable to open db at path %s in bucket %s. %s\n", + kDBPath.c_str(), bucketName.c_str(), s.ToString().c_str()); + return; + } + + // Put key-value into main db + s = db->Put(WriteOptions(), "key1", "value"); + assert(s.ok()); + std::string value; + + // get value from main db + s = db->Get(ReadOptions(), "key1", &value); + assert(s.ok()); + assert(value == "value"); + + // Flush all data from main db to sst files. + db->Flush(FlushOptions()); + DEFER { + delete db; + }; + + fprintf(stdout, "Successfully used db at %s and clone at %s in bucket %s.\n", + kDBPath.c_str(), kClonePath.c_str(), bucketName.c_str()); +} + +Status CloneDB(const std::string& clone_name, const std::string& src_bucket, + const std::string& src_object_path, + const std::string& dest_bucket, + const std::string& dest_object_path, + const CloudFileSystemOptions& cloud_fs_options, + std::unique_ptr* cloud_db, std::unique_ptr* cloud_env) { + CloudFileSystemOptions cloud_fs_options2; + + cloud_fs_options2.endpoint_override = "http://10.224.129.40:9000"; + cloud_fs_options2.credentials.InitializeSimple("minioadmin", "minioadmin"); + if (!cloud_fs_options2.credentials.HasValid().ok()) { + fprintf( + stderr, + "Please set env variables " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with cloud credentials"); + return rocksdb::Status::OK(); + } + + // Append the user name to the bucket name in an attempt to make it + // globally unique. S3 bucket-namess need to be globlly unique. + // If you want to rerun this example, then unique user-name suffix here. + char* user = getenv("USER"); + kBucketSuffix2_src.append(user); + kBucketSuffix2_dest.append(user); + + const std::string bucketPrefix = "rockset."; + // create a bucket name for debugging purposes + const std::string bucketName_src = bucketPrefix + kBucketSuffix2_src; + const std::string bucketName_dest = bucketPrefix + kBucketSuffix2_src; + //const std::string bucketName_dest = bucketPrefix + kBucketSuffix2_dest; + + // Needed if using bucket prefix other than the default "rockset." 
+ cloud_fs_options2.src_bucket.SetBucketName(kBucketSuffix2_src, bucketPrefix); + cloud_fs_options2.dest_bucket.SetBucketName(kBucketSuffix2_dest, bucketPrefix); + + CloudFileSystem* cfs; + Status st = CloudFileSystem::NewAwsFileSystem( + FileSystem::Default(), src_bucket, src_object_path, kRegion, kBucketSuffix2_src, + dest_object_path, kRegion, cloud_fs_options2, nullptr, &cfs); + + if (!st.ok()) { + fprintf(stderr, + "Unable to create an AWS environment with " + "bucket %s", + src_bucket.c_str()); + return st; + } + std::shared_ptr fs(cfs); + *cloud_env = NewCompositeEnv(fs); + + // Create options and use the AWS env that we created earlier + Options options; + options.env = cloud_env->get(); + + // No persistent cache + std::string persistent_cache = ""; + // open clone + DBCloud* db = nullptr; + st = DBCloud::Open(options, kClonePath, persistent_cache, 0, &db); + if (!st.ok()) { + fprintf(stderr, "Unable to open clone at path %s in bucket %s. %s\n", + kClonePath.c_str(), kBucketSuffix2_src.c_str(), st.ToString().c_str()); + return st; + } + //std::unique_ptr cloud_db2; + cloud_db->reset(db); + return Status::OK(); + +} + +TEST_F(CloudTest, clone_s3) { + // cloud environment config options here + CloudFileSystemOptions cloud_fs_options; + + cloud_fs_options.endpoint_override = "http://10.224.129.40:9000"; + cloud_fs_options.credentials.InitializeSimple("minioadmin", "minioadmin"); + if (!cloud_fs_options.credentials.HasValid().ok()) { + fprintf( + stderr, + "Please set env variables " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with cloud credentials"); + return; + } + + // Append the user name to the bucket name in an attempt to make it + // globally unique. S3 bucket-namess need to be globlly unique. + // If you want to rerun this example, then unique user-name suffix here. + char* user = getenv("USER"); + kBucketSuffix.append(user); + + const std::string bucketPrefix = "rockset."; + // create a bucket name for debugging purposes + const std::string bucketName = bucketPrefix + kBucketSuffix; + + // Needed if using bucket prefix other than the default "rockset." + cloud_fs_options.src_bucket.SetBucketName(kBucketSuffix, bucketPrefix); + cloud_fs_options.dest_bucket.SetBucketName(kBucketSuffix, bucketPrefix); + cloud_fs_options.src_bucket.SetBucketName("database", "pika."); + cloud_fs_options.dest_bucket.SetBucketName("database", "pika."); + // Create a new AWS cloud env Status + CloudFileSystem* cfs; + Status s = CloudFileSystem::NewAwsFileSystem( + FileSystem::Default(), "database", kDBPath, kRegion, "database", + kDBPath, kRegion, cloud_fs_options, nullptr, &cfs); + if (!s.ok()) { + fprintf(stderr, "Unable to create cloud env in bucket %s. %s\n", + bucketName.c_str(), s.ToString().c_str()); + return; + } + + + // Store a reference to a cloud env. A new cloud env object should be + // associated with every new cloud-db. + auto cloud_env = NewCompositeEnv(std::shared_ptr(cfs)); + + // Create options and use the AWS env that we created earlier + Options options; + options.env = cloud_env.get(); + options.create_if_missing = true; + + // No persistent cache + std::string persistent_cache = ""; + + // Create and Open DB + DBCloud* db = nullptr; + s = DBCloud::Open(options, kDBPath, persistent_cache, 0, &db); + + if (!s.ok()) { + fprintf(stderr, "Unable to open db at path %s in bucket %s. 
%s\n", + kDBPath.c_str(), bucketName.c_str(), s.ToString().c_str()); + return; + } + + // Put key-value into main db + s = db->Put(WriteOptions(), "key1", "value"); + assert(s.ok()); + std::string value; + + // get value from main db + s = db->Get(ReadOptions(), "key1", &value); + assert(s.ok()); + assert(value == "value"); + + // Flush all data from main db to sst files. + db->Flush(FlushOptions()); + + // Create a clone of the db and and verify that all's well. + // In real applications, a Clone would typically be created + // by a separate process. + //std::unique_ptr clone_db; + std::unique_ptr clone_env; + std::unique_ptr clone_db; + s = CloneDB("clone1", kBucketSuffix, kDBPath, kBucketSuffix, kClonePath, + cloud_fs_options, &clone_db, &clone_env); + if (!s.ok()) { + fprintf(stderr, "Unable to clone db at path %s in bucket %s. %s\n", + kDBPath.c_str(), bucketName.c_str(), s.ToString().c_str()); + return; + } + + // insert a key-value in the clone. + s = clone_db->Put(WriteOptions(), "name", "dhruba"); + assert(s.ok()); + + // assert that values from the main db appears in the clone + s = clone_db->Get(ReadOptions(), "key1", &value); + assert(s.ok()); + assert(value == "value"); + + clone_db->Savepoint(); + //clone_db->Flush(FlushOptions()); + clone_db.release(); + +DEFER { +delete db; +}; + + fprintf(stdout, "Successfully used db at %s and clone at %s in bucket %s.\n", + kDBPath.c_str(), kClonePath.c_str(), bucketName.c_str()); +} + +TEST_F(CloudTest, get_clone_s3) { + // cloud environment config options here + CloudFileSystemOptions cloud_fs_options; + + cloud_fs_options.endpoint_override = "http://10.224.129.40:9000"; + cloud_fs_options.credentials.InitializeSimple("minioadmin", "minioadmin"); + if (!cloud_fs_options.credentials.HasValid().ok()) { + fprintf( + stderr, + "Please set env variables " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with cloud credentials"); + return; + } + + // Append the user name to the bucket name in an attempt to make it + // globally unique. S3 bucket-namess need to be globlly unique. + // If you want to rerun this example, then unique user-name suffix here. + char* user = getenv("USER"); + kBucketSuffix.append(user); + + const std::string bucketPrefix = "rockset."; + // create a bucket name for debugging purposes + const std::string bucketName = bucketPrefix + kBucketSuffix; + + // Needed if using bucket prefix other than the default "rockset." + cloud_fs_options.src_bucket.SetBucketName(kBucketSuffix, bucketPrefix); + cloud_fs_options.dest_bucket.SetBucketName(kBucketSuffix, bucketPrefix); + + // Create a new AWS cloud env Status + CloudFileSystem* cfs; + Status s = CloudFileSystem::NewAwsFileSystem( + FileSystem::Default(), kBucketSuffix, kDBPath, kRegion, kBucketSuffix, + kDBPath, kRegion, cloud_fs_options, nullptr, &cfs); + if (!s.ok()) { + fprintf(stderr, "Unable to create cloud env in bucket %s. %s\n", + bucketName.c_str(), s.ToString().c_str()); + return; + } + + + // Store a reference to a cloud env. A new cloud env object should be + // associated with every new cloud-db. + auto cloud_env = NewCompositeEnv(std::shared_ptr(cfs)); + + // Create options and use the AWS env that we created earlier + Options options; + options.env = cloud_env.get(); + options.create_if_missing = true; + + // No persistent cache + std::string persistent_cache = ""; + + // Create and Open DB + DBCloud* db = nullptr; + s = DBCloud::Open(options, kDBPath, persistent_cache, 0, &db); + if (!s.ok()) { + fprintf(stderr, "Unable to open db at path %s in bucket %s. 
%s\n", + kDBPath.c_str(), bucketName.c_str(), s.ToString().c_str()); + return; + } + + // Put key-value into main db + std::string value; + s = db->Get(ReadOptions(), "name", &value); + std::cout << "value1: " << value << std::endl; + // get value from main db + s = db->Get(ReadOptions(), "key1", &value); + std::cout << "value2: " << value << std::endl; + assert(s.ok()); + assert(value == "value"); + return; +} + +TEST_F(CloudTest, delete_s3) { + // cloud environment config options here + CloudFileSystemOptions cloud_fs_options; + + cloud_fs_options.endpoint_override = "http://10.224.129.40:9000"; + cloud_fs_options.credentials.InitializeSimple("minioadmin", "minioadmin"); + if (!cloud_fs_options.credentials.HasValid().ok()) { + fprintf( + stderr, + "Please set env variables " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with cloud credentials"); + return; + } + + // Append the user name to the bucket name in an attempt to make it + // globally unique. S3 bucket-namess need to be globlly unique. + // If you want to rerun this example, then unique user-name suffix here. + char* user = getenv("USER"); + kBucketSuffix.append(user); + + const std::string bucketPrefix = "rockset."; + // create a bucket name for debugging purposes + const std::string bucketName = bucketPrefix + kBucketSuffix; + + // Needed if using bucket prefix other than the default "rockset." + cloud_fs_options.src_bucket.SetBucketName(kBucketSuffix, bucketPrefix); + cloud_fs_options.dest_bucket.SetBucketName(kBucketSuffix, bucketPrefix); + + // Create a new AWS cloud env Status + CloudFileSystem* cfs; + Status s = CloudFileSystem::NewAwsFileSystem( + FileSystem::Default(), kBucketSuffix, kDBPath, kRegion, kBucketSuffix, + kDBPath, kRegion, cloud_fs_options, nullptr, &cfs); + if (!s.ok()) { + fprintf(stderr, "Unable to create cloud env in bucket %s. %s\n", + bucketName.c_str(), s.ToString().c_str()); + return; + } + + + // Store a reference to a cloud env. A new cloud env object should be + // associated with every new cloud-db. + auto cloud_env = NewCompositeEnv(std::shared_ptr(cfs)); + + // Create options and use the AWS env that we created earlier + Options options; + options.env = cloud_env.get(); + options.create_if_missing = true; + + // No persistent cache + std::string persistent_cache = ""; + + // Create and Open DB + DBCloud* db = nullptr; + s = DBCloud::Open(options, kDBPath, persistent_cache, 0, &db); + if (!s.ok()) { + fprintf(stderr, "Unable to open db at path %s in bucket %s. 
%s\n", + kDBPath.c_str(), bucketName.c_str(), s.ToString().c_str()); + return; + } + //cfs->DeleteCloudFileFromDest(); + +} +TEST_F(CloudTest, del_bucket_s3) { + Aws::SDKOptions options; + Aws::InitAPI(options); + + Aws::Client::ClientConfiguration cfg; + cfg.endpointOverride = "10.224.129.40:9000"; + cfg.scheme = Aws::Http::Scheme::HTTP; + cfg.verifySSL = false; + + Aws::Auth::AWSCredentials cred("minioadmin", "minioadmin"); // access key, secret key + Aws::S3::S3Client s3_client(cred, cfg, + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + false, Aws::S3::US_EAST_1_REGIONAL_ENDPOINT_OPTION::NOT_SET); + + auto response = s3_client.ListBuckets(); + if (response.IsSuccess()) { + auto buckets = response.GetResult().GetBuckets(); + for (auto iter = buckets.begin(); iter != buckets.end(); ++iter) { + std::cout << iter->GetName() << "\t" << iter->GetCreationDate().ToLocalTimeString(Aws::Utils::DateFormat::ISO_8601) << std::endl; + } + } else { + std::cout << "Error while ListBuckets " << response.GetError().GetExceptionName() + << " " << response.GetError().GetMessage() << std::endl; + } + + + + // Aws::S3::S3Client s3_client; + Aws::S3::Model::DeleteBucketRequest request; + request.SetBucket("rockset.cloud2.clone.example.dst.charlieqiao"); + //s3_client.DeleteBucketAsync(request); + + Aws::S3::Model::ListObjectsRequest requ; + requ.SetBucket("rockset.cloud2.clone.example.dst.charlieqiao"); + + bool truncated = false; + do + { + auto outcome = s3_client.ListObjects(requ); + if (outcome.IsSuccess()) + { + std::cout << "listing objects" << std::endl; + for (const auto& object : outcome.GetResult().GetContents()) + { + Aws::S3::Model::DeleteObjectRequest request; + std::cout << "Object: " << object.GetKey() << std::endl; + request.SetBucket("rockset.cloud2.clone.example.dst.charlieqiao"); + request.SetKey(object.GetKey()); + auto outcome = s3_client.DeleteObject(request); + if (outcome.IsSuccess()) { + std::cout << "File deleted successfully" << std::endl; + } else { + std::cout << "Failed to delete file:" << outcome.GetError().GetMessage() << std::endl; + } + } + + // check whether there is another page of results + truncated = outcome.GetResult().GetIsTruncated(); + if (truncated) + { + requ.SetMarker(outcome.GetResult().GetNextMarker()); + } + } + else + { + std::cout << "ListObjects error: " << outcome.GetError().GetMessage() << std::endl; + break; + } + } while (truncated); + + auto outcome = s3_client.DeleteBucket(request); + if (!outcome.IsSuccess()) { + std::cout << "DeleteBucket error: " << outcome.GetError().GetMessage() << std::endl; + } + + Aws::ShutdownAPI(options); +} + +int main(int argc, char** argv) { + if (!pstd::FileExists("./log")) { + pstd::CreatePath("./log"); + } + FLAGS_log_dir = "./log"; + FLAGS_minloglevel = 0; + FLAGS_max_log_size = 1800; + FLAGS_logbufsecs = 0; + ::google::InitGoogleLogging("cloud_clone_test"); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/storage/tests/cloud_test.cc b/src/storage/tests/cloud_test.cc new file mode 100644 index 0000000000..32f29d4e72 --- /dev/null +++ b/src/storage/tests/cloud_test.cc @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include "glog/logging.h" + +#include "pstd/include/env.h" + +#include "storage/storage.h" +#include "src/redis.h" +#include "storage/util.h" + +using namespace storage; + +std::queue> items; + +struct MockReplicationListener : public rocksdb::ReplicationLogListener{ + MockReplicationListener() = default; + ~MockReplicationListener() = default; + std::string
OnReplicationLogRecord(rocksdb::ReplicationLogRecord record) override { + std::string cnt = std::to_string(counter_.fetch_add(1)); + items.push(std::make_pair(cnt, record)); + LOG(WARNING) << "write binlog, replication_sequence: " << cnt << " type: " << record.type << " items count:" << items.size(); + return cnt; + } + std::atomic counter_ = {0}; +}; + +class CloudTest : public ::testing::Test { +public: + CloudTest() = default; + ~CloudTest() override = default; + + void SetUp() override { + storage_options.options.create_if_missing = true; + storage_options.options.avoid_flush_during_shutdown = true; + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; + } + + void TearDown() override { + } + + static void SetUpTestSuite() {} + static void TearDownTestSuite() {} + + StorageOptions storage_options; + storage::Status s; + std::string path; +}; + +Status OpenMaster(storage::Redis*& inst, StorageOptions storage_options) { + storage::Storage str; + while (!items.empty()) + { + items.pop(); + } + + inst = new storage::Redis(&str, 0); + auto listener = std::make_shared(); + inst->ResetLogListener(listener); + storage_options.cloud_fs_options.is_master = true; + auto s = inst->Open(storage_options, "cloud_test"); + return s; +} + +Status OpenSlave(storage::Redis*& inst, StorageOptions storage_options) { + storage::Storage str; + inst = new storage::Redis(&str, 0); + storage_options.cloud_fs_options.is_master = false; + auto s = inst->Open(storage_options, "cloud_test"); + return s; +} + +TEST_F(CloudTest, simple_master) { + storage::Redis* inst; + auto s = OpenMaster(inst, storage_options); + ASSERT_TRUE(s.ok()); + for (int i = 0; i < 10000; i++) { + if (i + 1 % 100 == 0) { + sleep(1); + } + s = inst->Set(std::to_string(i), std::to_string(i)); + ASSERT_TRUE(s.ok()); + } + rocksdb::FlushOptions fo; + fo.wait = true; + inst->GetDB()->Flush(fo); + delete inst; + inst = nullptr; +} + +Status SlaveCatchUp(storage::Redis* slave) { + Status s; + LOG(WARNING) << "SlaveCatchUp, items.size: " << items.size(); + while (!items.empty()) { + std::string replication_sequence = items.front().first; + auto record = items.front().second; + items.pop(); + LOG(WARNING) << "replication_sequence: " << replication_sequence << " type: " << record.type; + // slave catchup + rocksdb::DB::ApplyReplicationLogRecordInfo info; + s = slave->GetDB()->ApplyReplicationLogRecord(record, replication_sequence, nullptr, true, &info, rocksdb::DB::AR_EVICT_OBSOLETE_FILES); + if (!s.ok()) { + LOG(WARNING) << "reapply log error: " << s.ToString(); + return s; + } + } + return s; +} + +TEST_F(CloudTest, master_slave) { + storage::Redis* inst_master, *inst_slave; + auto s = OpenMaster(inst_master, storage_options); + ASSERT_TRUE(s.ok()); + // master write + for (int i = 0; i < 20000; i++) { + if (i + 1 % 100 == 0) { + sleep(1); + } + s = inst_master->Set(std::to_string(i), std::to_string(i)); + ASSERT_TRUE(s.ok()); + } + + rocksdb::FlushOptions fo; + fo.wait = true; + inst_master->GetDB()->Flush(fo); + delete inst_master; + inst_master = nullptr; + + std::vector children; + pstd::GetChildren("cloud_test", children); + std::for_each(children.begin(), 
children.end(), [](auto& file) { + if (file.find("sst") != std::string::npos) { + std::string path = "cloud_test/"; + path = path + file; + pstd::DeleteFile(path); + } + }); + + s = OpenSlave(inst_slave, storage_options); + ASSERT_TRUE(s.ok()); + for (int i = 0; i < 20000; i++) { + std::string val; + s = inst_slave->Get(std::to_string(i), &val); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(val, std::to_string(i)); + } + SlaveCatchUp(inst_slave); + + delete inst_slave; + inst_slave = nullptr; + + s = OpenMaster(inst_master, storage_options); + ASSERT_TRUE(s.ok()); + for (int i = 0; i < 20000; i++) { + std::string val; + s = inst_master->Get(std::to_string(i), &val); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(val, std::to_string(i)); + } + delete inst_master; + inst_master = nullptr; +} + +TEST_F(CloudTest, switch_master) { + storage::Redis* inst_master, *inst_slave; + auto s = OpenMaster(inst_master, storage_options); + ASSERT_TRUE(s.ok()); + // master write + for (int i = 0; i < 20000; i++) { + if (i + 1 % 100 == 0) { + sleep(1); + } + s = inst_master->Set(std::to_string(i), std::to_string(i)); + ASSERT_TRUE(s.ok()); + } + + delete inst_master; + inst_master = nullptr; + LOG(WARNING) << "close master already"; + sleep(20); + + std::vector children; + pstd::GetChildren("cloud_test", children); + std::for_each(children.begin(), children.end(), [](auto& file) { + if (file.find("sst") != std::string::npos) { + std::string path = "cloud_test/"; + path = path + file; + pstd::DeleteFile(path); + } + }); + + s = OpenSlave(inst_slave, storage_options); + ASSERT_TRUE(s.ok()); + s = SlaveCatchUp(inst_slave); + ASSERT_TRUE(s.ok()); + for (int i = 0; i < 20000; i++) { + std::string val; + s = inst_slave->Get(std::to_string(i), &val); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(val, std::to_string(i)); + } + s = inst_slave->SwitchMaster(false, true); + ASSERT_TRUE(s.ok()); + delete inst_slave; + inst_slave = nullptr; + + pstd::GetChildren("cloud_test", children); + std::for_each(children.begin(), children.end(), [](auto& file) { + if (file.find("sst") != std::string::npos) { + std::string path = "cloud_test/"; + path = path + file; + pstd::DeleteFile(path); + } + }); + + s = OpenMaster(inst_master, storage_options); + ASSERT_TRUE(s.ok()); + for (int i = 0; i < 20000; i++) { + std::string val; + s = inst_master->Get(std::to_string(i), &val); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(val, std::to_string(i)); + } + delete inst_master; + inst_master = nullptr; +} + +int main(int argc, char** argv) { + if (!pstd::FileExists("./log")) { + pstd::CreatePath("./log"); + } + FLAGS_log_dir = "./log"; + FLAGS_minloglevel = 0; + FLAGS_max_log_size = 1800; + FLAGS_logbufsecs = 0; + ::google::InitGoogleLogging("cloud_test"); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/storage/tests/hashes_test.cc b/src/storage/tests/hashes_test.cc index 50d2207256..d4a1b952fd 100644 --- a/src/storage/tests/hashes_test.cc +++ b/src/storage/tests/hashes_test.cc @@ -29,7 +29,17 @@ class HashesTest : public ::testing::Test { pstd::DeleteDirIfExist(path); mkdir(path.c_str(), 0755); storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", 
"pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); + ASSERT_TRUE(s.ok()); } void TearDown() override { diff --git a/src/storage/tests/hyperloglog_test.cc b/src/storage/tests/hyperloglog_test.cc index eb03a39514..452e325fb4 100644 --- a/src/storage/tests/hyperloglog_test.cc +++ b/src/storage/tests/hyperloglog_test.cc @@ -23,7 +23,17 @@ class HyperLogLogTest : public ::testing::Test { mkdir(path.c_str(), 0755); } storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); + ASSERT_TRUE(s.ok()); } void TearDown() override { diff --git a/src/storage/tests/keys_test.cc b/src/storage/tests/keys_test.cc index 8acff5c12f..6395156e19 100644 --- a/src/storage/tests/keys_test.cc +++ b/src/storage/tests/keys_test.cc @@ -29,7 +29,17 @@ class KeysTest : public ::testing::Test { pstd::DeleteDirIfExist(path); mkdir(path.c_str(), 0755); storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); + ASSERT_TRUE(s.ok()); } void TearDown() override { diff --git a/src/storage/tests/lists_filter_test.cc b/src/storage/tests/lists_filter_test.cc index 694fe66bb6..73c0fa7f83 100644 --- a/src/storage/tests/lists_filter_test.cc +++ b/src/storage/tests/lists_filter_test.cc @@ -27,8 +27,37 @@ class ListsFilterTest : public ::testing::Test { if (access(db_path.c_str(), F_OK) != 0) { mkdir(db_path.c_str(), 0755); } - options.create_if_missing = true; + + options.create_if_missing = true; +#ifdef USE_S3 + // rocksdb-cloud env + rocksdb::CloudFileSystemOptions cloud_fs_opts; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + assert(cloud_fs_opts.credentials.HasValid().ok()); + std::string s3_path = db_path[0] == '.' ? 
db_path.substr(1) : db_path; + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.src_bucket.SetObjectPath(s3_path); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetObjectPath(s3_path); + rocksdb::CloudFileSystem* cfs = nullptr; + Status s = rocksdb::CloudFileSystem::NewAwsFileSystem( + rocksdb::FileSystem::Default(), + cloud_fs_opts, + nullptr, + &cfs + ); + assert(s.ok()); + std::shared_ptr cloud_fs(cfs); + cloud_env = NewCompositeEnv(cloud_fs); + assert(cloud_env); + options.env = cloud_env.get(); + s = rocksdb::DBCloud::Open(options, db_path, "", 0, &meta_db); +#else s = rocksdb::DB::Open(options, db_path, &meta_db); +#endif + + if (s.ok()) { // create column family rocksdb::ColumnFamilyHandle* cf; @@ -45,7 +74,12 @@ class ListsFilterTest : public ::testing::Test { // Data CF column_families.emplace_back("data_cf", data_cf_ops); +#ifdef USE_S3 + s = rocksdb::DBCloud::Open(options, db_path, column_families, "", 0, &handles, &meta_db); +#else s = rocksdb::DB::Open(options, db_path, column_families, &handles, &meta_db); +#endif + assert(s.ok()); } ~ListsFilterTest() override = default; @@ -58,7 +92,12 @@ class ListsFilterTest : public ::testing::Test { } storage::Options options; +#ifdef USE_S3 + rocksdb::DBCloud* meta_db; + std::unique_ptr cloud_env; +#else rocksdb::DB* meta_db; +#endif storage::Status s; std::vector column_families; diff --git a/src/storage/tests/lists_test.cc b/src/storage/tests/lists_test.cc index ed3325a316..1309c73572 100644 --- a/src/storage/tests/lists_test.cc +++ b/src/storage/tests/lists_test.cc @@ -84,6 +84,15 @@ class ListsTest : public ::testing::Test { pstd::DeleteDirIfExist(path); mkdir(path.c_str(), 0755); storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); if (!s.ok()) { printf("Open db failed, exit...\n"); diff --git a/src/storage/tests/sets_test.cc b/src/storage/tests/sets_test.cc index c6c4dd220e..1b696c3e9b 100644 --- a/src/storage/tests/sets_test.cc +++ b/src/storage/tests/sets_test.cc @@ -26,7 +26,17 @@ class SetsTest : public ::testing::Test { pstd::DeleteDirIfExist(path); mkdir(path.c_str(), 0755); storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); + ASSERT_TRUE(s.ok()); } void TearDown() override { diff --git a/src/storage/tests/strings_test.cc b/src/storage/tests/strings_test.cc index 33e15c67ef..724ca6175c 100644 --- a/src/storage/tests/strings_test.cc +++ b/src/storage/tests/strings_test.cc @@ -26,7 +26,17 @@ class StringsTest : public ::testing::Test { 
pstd::DeleteDirIfExist(path); mkdir(path.c_str(), 0755); storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); + ASSERT_TRUE(s.ok()); } void TearDown() override { diff --git a/src/storage/tests/zsets_test.cc b/src/storage/tests/zsets_test.cc index 465c48f00e..d1d470ec80 100644 --- a/src/storage/tests/zsets_test.cc +++ b/src/storage/tests/zsets_test.cc @@ -31,6 +31,15 @@ class ZSetsTest : public ::testing::Test { pstd::DeleteDirIfExist(path); mkdir(path.c_str(), 0755); storage_options.options.create_if_missing = true; +#ifdef USE_S3 + auto& cloud_fs_opts = storage_options.cloud_fs_options; + cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000"; + cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin"); + ASSERT_TRUE(cloud_fs_opts.credentials.HasValid().ok()); + cloud_fs_opts.src_bucket.SetBucketName("database.unit.test", "pika."); + cloud_fs_opts.dest_bucket.SetBucketName("database.unit.test", "pika."); + storage_options.options.max_log_file_size = 0; +#endif s = db.Open(storage_options, path); if (!s.ok()) { printf("Open db failed, exit...\n"); diff --git a/tests/conf/pika.conf b/tests/conf/pika.conf index 5f9167d96a..976f0ad2b6 100644 --- a/tests/conf/pika.conf +++ b/tests/conf/pika.conf @@ -499,3 +499,31 @@ cache-lfu-decay-time: 1 # # aclfile : ../conf/users.acl +###################################################################### +# rocksdb-cloud options +####################################################################### + +# Normally, the AWS SDK will automatically determine the endpoint based on the selected region. +# However, in special cases, you can manually specify the URL of the endpoint through this configuration, +# such as local development. +# Default: "" +cloud-endpoint-override : http://127.0.0.1:9000 + +# The aws access key id and aws secret key used for authentication when accessing aws s3. +cloud-access-key : minioadmin +cloud-secret-key : minioadmin + +# The source bucket name prefix and suffix to use for storage on s3 +# The final bucket name is [prefix][suffix] +# Default: "pika." +# cloud-src-bucket-prefix : +# Default: "database" +cloud-src-bucket-suffix : integration.test + +# The source bucket region +# cloud-src-bucket-region : + +# Configuration information of the destination bucket +# cloud-dest-bucket-prefix : +cloud-dest-bucket-suffix : integration.test +# cloud-dest-bucket-region : diff --git a/tests/gtest/cloud_binlog_test.cc b/tests/gtest/cloud_binlog_test.cc new file mode 100644 index 0000000000..9357d673b8 --- /dev/null +++ b/tests/gtest/cloud_binlog_test.cc @@ -0,0 +1,101 @@ +// +// Created by Bai Xin on 2024/3/11. 
+//
+#include
+
+#include <gtest/gtest.h>
+
+#include "include/pika_binlog_reader.h"
+#include "include/pika_cloud_binlog.h"
+#include "include/pika_cloud_binlog_transverter.h"
+
+class CloudBinlogTransverterTest : public ::testing::Test {};
+
+class CloudBinlogTest : public ::testing::Test {
+ public:
+  CloudBinlogTest() = default;
+  ~CloudBinlogTest() override = default;
+
+  void SetUp() override {
+    std::string path = "./cloudbinlog/";
+    pstd::DeleteDirIfExist(path);
+    mkdir(path.c_str(), 0755);
+    cloudBinlog = std::make_shared(path);
+  }
+
+  void TearDown() override {
+    std::string path = "./cloudbinlog";
+    pstd::DeleteFile(path.c_str());
+  }
+
+  static void SetUpTestSuite() {}
+  static void TearDownTestSuite() {}
+
+  std::shared_ptr cloudBinlog;
+};
+
+TEST_F(CloudBinlogTest, GetPutTest) {
+  pstd::Status s = CloudBinlogTest::cloudBinlog->Put("test", 1, 1);
+  ASSERT_TRUE(s.ok());
+
+  PikaBinlogReader binlog_reader;
+  uint32_t filenum = 0;
+  uint32_t term = 0;
+  uint64_t offset = 0;
+
+  s = CloudBinlogTest::cloudBinlog->GetProducerStatus(&filenum, &offset, &term, nullptr);
+  ASSERT_TRUE(s.ok());
+
+  s = CloudBinlogTest::cloudBinlog->Put("yyyy", 1, 1);
+  ASSERT_TRUE(s.ok());
+
+  int res = binlog_reader.Seek(CloudBinlogTest::cloudBinlog, filenum, offset);
+  ASSERT_EQ(res, 0);
+
+  std::string binlog;
+  s = binlog_reader.Get(&binlog, &filenum, &offset);
+  ASSERT_TRUE(s.ok());
+
+  cloud::BinlogCloudItem* binlog_item = new cloud::BinlogCloudItem();
+  PikaCloudBinlogTransverter::BinlogDecode(binlog, binlog_item);
+  ASSERT_EQ(1, binlog_item->db_id());
+  ASSERT_EQ(1, binlog_item->rocksdb_id());
+  ASSERT_STREQ("yyyy", binlog_item->content().c_str());
+
+  delete binlog_item;
+}
+
+TEST_F(CloudBinlogTransverterTest, CodeTest) {
+  std::string binlog_item_s =
+      PikaCloudBinlogTransverter::BinlogEncode(1, 1, 1, 1, 4294967294, 18446744073709551615, "test", 0);
+  cloud::BinlogCloudItem* binlog_item = new cloud::BinlogCloudItem();
+  PikaCloudBinlogTransverter::BinlogDecode(binlog_item_s, binlog_item);
+  ASSERT_EQ(1, binlog_item->db_id());
+  ASSERT_EQ(1, binlog_item->rocksdb_id());
+  ASSERT_EQ(1, binlog_item->exec_time());
+  ASSERT_EQ(1, binlog_item->term_id());
+  ASSERT_EQ(4294967294, binlog_item->file_num());  // 4294967294 = 2^32 - 2
+  ASSERT_EQ(18446744073709551615, binlog_item->offset());  // 18446744073709551615 = 2^64 - 1
+  ASSERT_STREQ("test", binlog_item->content().c_str());
+  delete binlog_item;
+}
+
+TEST_F(CloudBinlogTransverterTest, WithoutContentDecodeTest) {
+  std::string binlog_item_s =
+      PikaCloudBinlogTransverter::BinlogEncode(1, 1, 1, 1, 4294967294, 18446744073709551615, "test", 0);
+  cloud::BinlogCloudItem* binlog_item = new cloud::BinlogCloudItem();
+  PikaCloudBinlogTransverter::BinlogItemWithoutContentDecode(binlog_item_s, binlog_item);
+  ASSERT_EQ(1, binlog_item->db_id());
+  ASSERT_EQ(1, binlog_item->rocksdb_id());
+  ASSERT_EQ(1, binlog_item->exec_time());
+  ASSERT_EQ(1, binlog_item->term_id());
+  ASSERT_EQ(4294967294, binlog_item->file_num());  // 4294967294 = 2^32 - 2
+  ASSERT_EQ(18446744073709551615, binlog_item->offset());  // 18446744073709551615 = 2^64 - 1
+  ASSERT_STREQ("", binlog_item->content().c_str());
+  delete binlog_item;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tools/benchmark_client/CMakeLists.txt b/tools/benchmark_client/CMakeLists.txt
index 8cdd683174..7640908d68 100644
--- a/tools/benchmark_client/CMakeLists.txt
+++ b/tools/benchmark_client/CMakeLists.txt
@@ -13,6 +13,10 @@ add_executable(benchmark_client
  ${BASE_OBJS})
target_include_directories(benchmark_client PRIVATE ${INSTALL_INCLUDEDIR} PRIVATE ${PROJECT_SOURCE_DIR} ${ROCKSDB_SOURCE_DIR} ${GLOG_INCLUDE_DIR})
target_link_libraries(benchmark_client pthread)
+target_link_libraries(benchmark_client curl)
+target_link_libraries(benchmark_client z)
+target_link_libraries(benchmark_client ${PROMETHEUS_CPP_CORE_LIB})
+target_link_libraries(benchmark_client ${PROMETHEUS_CPP_PULL_LIB})
target_link_libraries(benchmark_client ${ROCKSDB_LIBRARY})
target_link_libraries(benchmark_client ${GLOG_LIBRARY})
target_link_libraries(benchmark_client ${SNAPPY_LIBRARY})
diff --git a/tools/benchmark_client/benchmark_client.cc b/tools/benchmark_client/benchmark_client.cc
index 2dbd2bb34e..6c27f0e126 100644
--- a/tools/benchmark_client/benchmark_client.cc
+++ b/tools/benchmark_client/benchmark_client.cc
@@ -7,6 +7,7 @@
#include
#include
#include
+#include "unistd.h"
#include
#include
#include
@@ -16,10 +17,21 @@
#include "monitoring/histogram.h"
#include "hiredis/hiredis.h"
+#include "prometheus/client_metric.h"
+#include "prometheus/histogram.h"
+#include "prometheus/family.h"
+#include "prometheus/exposer.h"
+#include "prometheus/registry.h"
+
#include "pstd/include/pstd_status.h"
#include "pstd/include/pstd_string.h"
#include "pstd/include/env.h"
+std::function<void(double)> Observer;
+std::function<void(double)> Increment;
+
+using namespace prometheus;
+
DEFINE_string(command, "generate", "command to execute, eg: generate/get/set/zadd");
DEFINE_bool(pipeline, false, "whether to enable pipeline");
DEFINE_string(host, "127.0.0.1", "target server's host");
@@ -33,6 +45,7 @@ DEFINE_int32(thread_num, 10, "concurrent thread num");
DEFINE_string(dbs, "0", "dbs name, eg: 0,1,2");
DEFINE_int32(element_count, 1, "elements number in hash/list/set/zset");
DEFINE_bool(compare_value, false, "whether compare result or not");
+DEFINE_string(exporter_addr, "0.0.0.0:9999", "metrics exporter listen addr");

using std::default_random_engine;
using pstd::Status;
@@ -118,10 +131,10 @@ bool CompareValue(const std::string& expect, const std::string& actual) {
}

void PrepareKeys(int suffix, std::vector<std::string>* keys) {
-  keys->resize(FLAGS_count);
+  keys->resize(FLAGS_count * FLAGS_element_count);
  std::string filename = "benchmark_keyfile_" + std::to_string(suffix);
  FILE* fp = fopen(filename.c_str(), "r");
-  for (int idx = 0; idx < FLAGS_count; ++idx) {
+  for (int idx = 0; idx < FLAGS_count * FLAGS_element_count; ++idx) {
    char* key = new char[FLAGS_key_size + 2];
    fgets(key, FLAGS_key_size + 2, fp);
    key[FLAGS_key_size] = '\0';
@@ -282,6 +295,61 @@ redisContext* Prepare(ThreadArg* arg) {
  return c;
}

+void FreeAndReconnect(redisContext*& c, ThreadArg* arg) {
+  LOG(INFO) << "request timeout, reconnect";
+  redisFree(c);
+  c = nullptr;
+  while (!c) {
+    c = Prepare(arg);
+  }
+}
+
+Status RunBatchGetCommand(redisContext*& c, ThreadArg* arg) {
+  std::vector<std::string> keys;
+  PrepareKeys(arg->idx, &keys);
+
+  for (int idx = 0; idx < FLAGS_count; ++idx) {
+    if (idx % 10000 == 0) {
+      LOG(INFO) << "finish " << idx << " mget";
+    }
+
+    std::vector<const char*> get_argv(FLAGS_element_count + 1);
+    std::vector<size_t> get_argvlen(FLAGS_element_count + 1);
+    get_argv[0] = "mget";
+    get_argvlen[0] = 4;
+    for (int i = 0; i < FLAGS_element_count; ++i) {
+      get_argv[i + 1] = keys[idx * FLAGS_element_count + i].c_str();
+      get_argvlen[i + 1] = keys[idx * FLAGS_element_count + i].size();
+    }
+
+    int retry_times = 0;
+    while (true) {
+      redisReply* res = nullptr;
+      uint64_t begin = pstd::NowMicros();
+      res = reinterpret_cast<redisReply*>(
+          redisCommandArgv(c, get_argv.size(),
+                           &(get_argv[0]), &(get_argvlen[0])));
+      Increment(1);
+      Observer((pstd::NowMicros() - begin) / 1000.0);
+
+      // nullptr res, reconnect
+      if (!res) {
+        FreeAndReconnect(c, arg);
+        continue;
+      }
+
+      // success
+      if (res->type == REDIS_REPLY_ARRAY) {
+        freeReplyObject(res);
+        break;
+      }
+
+      LOG(ERROR) << "mget failed";
+      freeReplyObject(res);
+    }
+  }
+  return Status::OK();
+}
+
Status RunGetCommand(redisContext*& c, ThreadArg* arg) {
  redisReply* res = nullptr;
  std::vector<std::string> keys;
@@ -305,7 +373,10 @@ Status RunGetCommand(redisContext*& c, ThreadArg* arg) {
    res = reinterpret_cast<redisReply*>(
        redisCommandArgv(c, 2, reinterpret_cast<const char**>(argv),
                         reinterpret_cast<const size_t*>(argvlen)));
-    hist->Add(pstd::NowMicros() - begin);
+    uint64_t now = pstd::NowMicros();
+    Observer((now - begin) / 1000.0);
+    hist->Add(now - begin);
+    Increment(1);

    if (!res) {
      LOG(INFO) << FLAGS_command << " timeout, key: " << key;
@@ -543,30 +614,39 @@ Status RunSetCommand(redisContext*& c, ThreadArg* arg) {
  std::vector<std::string> keys;
  PrepareKeys(arg->idx, &keys);

-  for (int idx = 0; idx < FLAGS_count; ++idx) {
+  for (int idx = 0; idx < FLAGS_count * FLAGS_element_count; ++idx) {
    if (idx % 10000 == 0) {
      LOG(INFO) << "finish " << idx << " request";
    }

-    const char* set_argv[3];
-    size_t set_argvlen[3];
+    const char* set_argv[4];
+    size_t set_argvlen[4];
    std::string value;
    std::string key = keys[idx];
    GenerateValue(key, FLAGS_value_size, &value);
+    std::string expire_seconds = "86400";

-    set_argv[0] = "set";
-    set_argvlen[0] = 3;
+    set_argv[0] = "setex";
+    set_argvlen[0] = 5;
    set_argv[1] = key.c_str();
    set_argvlen[1] = key.size();
-    set_argv[2] = value.c_str();
-    set_argvlen[2] = value.size();
+    set_argv[2] = expire_seconds.c_str();
+    set_argvlen[2] = expire_seconds.size();
+    set_argv[3] = value.c_str();
+    set_argvlen[3] = value.size();

    uint64_t begin = pstd::NowMicros();
    res = reinterpret_cast<redisReply*>(
-        redisCommandArgv(c, 3, reinterpret_cast<const char**>(set_argv),
+        redisCommandArgv(c, 4, reinterpret_cast<const char**>(set_argv),
                         reinterpret_cast<const size_t*>(set_argvlen)));
-    hist->Add(pstd::NowMicros() - begin);
+    uint64_t now = pstd::NowMicros();
+    if (now - begin > 10 * 1000) {
+      LOG(ERROR) << "setex costs " << (now - begin) / 1000 << " ms";
+    }
+    Observer((now - begin) / 1000.0);
+    hist->Add(now - begin);
+    Increment(1);

    if (!res) {
      LOG(INFO) << FLAGS_command << " timeout, key: " << key;
@@ -578,7 +658,7 @@ Status RunSetCommand(redisContext*& c, ThreadArg* arg) {
      }
    } else if (res->type != REDIS_REPLY_STATUS) {
      LOG(INFO) << FLAGS_command << " invalid type: " << res->type
-                << " key: " << key;
+                << " key: " << key << " response str: " << res->str;
      arg->stat.error_cnt++;
    } else {
      arg->stat.success_cnt++;
@@ -808,7 +888,9 @@ void* ThreadMain(void* arg) {
  }

  Status s;
-  if (FLAGS_command == "get") {
+  if (FLAGS_command == "mget") {
+    s = RunBatchGetCommand(c, ta);
+  } else if (FLAGS_command == "get") {
    s = RunGetCommand(c, ta);
  } else if (FLAGS_command == "set") {
    s = RunSetCommand(c, ta);
@@ -844,6 +926,41 @@ int main(int argc, char* argv[]) {
  if (tables.empty()) {
    exit(-1);
  }
+  char host_name[255];
+  if (gethostname(host_name, sizeof(host_name)) == -1) {
+    std::cout << "get hostname failed, exit";
+    exit(1);
+  }
+  std::string bind_addr = FLAGS_exporter_addr;
+  Exposer exposer{bind_addr};
+  auto registry = std::make_shared<Registry>();
+  exposer.RegisterCollectable(registry, "/metrics");
+
+  auto& counter_family = BuildCounter()
+                             .Name("request_count")
+                             .Help("How many times the api has been called")
+                             .Labels({{"hostname", host_name}, {"command", FLAGS_command}})
+                             .Register(*registry);
+
+  auto& api_counter = counter_family.Add(
+      {{"prometheus_test_counter", "test_counter"}, {"yet_another_label", "value"}});
+  Increment = [&api_counter](double cost) {
+    api_counter.Increment(cost);
+  };
+
+  auto& histogram_family = BuildHistogram()
+                               .Name("request_time")
+                               .Help("analyze the request duration with a histogram")
+                               .Labels({{"hostname", host_name}, {"command", FLAGS_command}})
+                               .Register(*registry);
+  auto& task_histogram = histogram_family.Add({{"prometheus_test_histogram", "test_histogram"},
+      {"yet_another_label", "value"}}, Histogram::BucketBoundaries{1, 2, 3, 4, 5, 6, 7,
+      8, 10, 12, 14, 17, 20, 24, 29, 34, 40, 48, 57, 68, 81, 96, 114, 135, 160, 190, 226, 268, 318,
+      378, 449, 533, 633, 752, 894, 1062, 1262, 1500, 1782, 2117, 2516, 2990, 3553, 4222, 5017, 5961,
+      7083, 8416, 10000});
+  Observer = [&task_histogram](double cost) {
+    task_histogram.Observe(cost);
+  };

  FLAGS_logtostdout = true;
  FLAGS_minloglevel = 0;
@@ -885,4 +1002,4 @@ int main(int argc, char* argv[]) {
  std::cout << "Timeout Count: " << stat.timeout_cnt << " Error Count: " << stat.error_cnt << std::endl;
  std::cout << "stats: " << hist->ToString() << std::endl;
  return 0;
-}
+}
\ No newline at end of file
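
Note for reviewers: every storage test fixture touched above repeats the same #ifdef USE_S3 MinIO setup. If that duplication becomes a maintenance burden, it could be hoisted into a small shared helper. A minimal sketch follows, assuming the storage_options.cloud_fs_options fields used in this patch; the header name cloud_test_util.h and the function ApplyCloudTestOptions are illustrative only and are not part of this change.

// cloud_test_util.h -- hypothetical helper, not part of this patch.
// Centralizes the MinIO-backed cloud_fs_options setup that each storage
// unit test above currently repeats under #ifdef USE_S3.
#pragma once

#include <cassert>
#include <string>

#include "storage/storage.h"  // storage::StorageOptions, as used by the tests above

#ifdef USE_S3
inline void ApplyCloudTestOptions(storage::StorageOptions& storage_options,
                                  const std::string& bucket = "database.unit.test",
                                  const std::string& bucket_prefix = "pika.") {
  auto& cloud_fs_opts = storage_options.cloud_fs_options;
  // Local MinIO endpoint and credentials, matching the values hard-coded in the tests.
  cloud_fs_opts.endpoint_override = "http://127.0.0.1:9000";
  cloud_fs_opts.credentials.InitializeSimple("minioadmin", "minioadmin");
  assert(cloud_fs_opts.credentials.HasValid().ok());
  // Same source/destination bucket layout the tests configure explicitly.
  cloud_fs_opts.src_bucket.SetBucketName(bucket, bucket_prefix);
  cloud_fs_opts.dest_bucket.SetBucketName(bucket, bucket_prefix);
  // Mirrors the max_log_file_size = 0 setting used by the fixtures above.
  storage_options.options.max_log_file_size = 0;
}
#endif  // USE_S3

Each fixture's SetUp() could then call ApplyCloudTestOptions(storage_options); inside its existing #ifdef USE_S3 block. Likewise, the new benchmark_client mode added above would be exercised with something like ./benchmark_client -command=mget -element_count=100 -exporter_addr=0.0.0.0:9999 (values illustrative), with the Prometheus metrics scraped from the /metrics endpoint registered in main().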