From a7038de4e96b6e5f4162aa15aa0f1d9964464ec3 Mon Sep 17 00:00:00 2001 From: Steve Leung Date: Thu, 16 Jul 2020 15:10:20 -0600 Subject: [PATCH] Tests for running transforms in multiple streams and threads * rocfft-test: add simple multithread/multistream tests --- clients/client_utils.h | 3 + clients/tests/CMakeLists.txt | 4 +- clients/tests/multithread_test.cpp | 309 +++++++++++++++++++++++++++++ clients/tests/unit_test.cpp | 65 ------ 4 files changed, 315 insertions(+), 66 deletions(-) create mode 100644 clients/tests/multithread_test.cpp diff --git a/clients/client_utils.h b/clients/client_utils.h index a4db762c..5543fd94 100644 --- a/clients/client_utils.h +++ b/clients/client_utils.h @@ -23,8 +23,11 @@ #include #include +#include #include #include +#include +#include #include "rocfft.h" #include diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 8650a858..67e675af 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -41,6 +41,7 @@ set(rocfft_test_source accuracy_test_1D.cpp accuracy_test_2D.cpp accuracy_test_3D.cpp + multithread_test.cpp unit_test.cpp hipfft_test.cpp misc/source/test_exception.cpp @@ -103,6 +104,7 @@ endif() set_target_properties( rocfft-test PROPERTIES DEBUG_POSTFIX "-d" - CXX_EXTENSIONS NO ) + CXX_EXTENSIONS NO + CXX_STANDARD 14 ) set_target_properties( rocfft-test PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) diff --git a/clients/tests/multithread_test.cpp b/clients/tests/multithread_test.cpp new file mode 100644 index 00000000..ff9f8cde --- /dev/null +++ b/clients/tests/multithread_test.cpp @@ -0,0 +1,309 @@ +// Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +#include "../client_utils.h" +#include "rocfft.h" +#include "rocfft_against_fftw.h" +#include +#include +#include +#include +#include +#include + +// normalize results of an inverse transform, so it can be directly +// compared to the original data before the forward transform +__global__ void normalize_inverse_results(float2* array, float N) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + array[idx].x /= N; + array[idx].y /= N; +} + +// Run a transform of specified dimensions, size N on each dimension. +// Data is randomly generated based on the seed value, and we do a +// forward + inverse transform and compare against what we started +// with. +struct Test_Transform +{ + // real constructor sets all the data up and creates the plans + Test_Transform(size_t _N, size_t _dim, uint32_t _seed) + : N(_N) + , dim(_dim) + , seed(_seed) + { + // compute total data size + size_t datasize = 1; + for(size_t i = 0; i < dim; ++i) + { + datasize *= N; + } + + size_t Nbytes = datasize * sizeof(float2); + + // Create HIP device buffers + EXPECT_EQ(hipMalloc(&device_mem_in, Nbytes), hipSuccess); + EXPECT_EQ(hipMalloc(&device_mem_out, Nbytes), hipSuccess); + + // Initialize data + std::minstd_rand gen(seed); + std::uniform_real_distribution dist(-1.0f, 1.0f); + host_mem_in.resize(datasize); + host_mem_out.resize(datasize); + for(size_t i = 0; i < datasize; i++) + { + host_mem_in[i].x = dist(gen); + host_mem_in[i].y = dist(gen); + } + + // Copy data to device + EXPECT_EQ(hipMemcpy(device_mem_in, host_mem_in.data(), Nbytes, hipMemcpyHostToDevice), + hipSuccess); + } + Test_Transform(const Test_Transform&) = delete; + void operator=(const Test_Transform&) = delete; + Test_Transform(Test_Transform&& other) + { + this->stream = other.stream; + other.stream = nullptr; + this->work_buffer = other.work_buffer; + other.work_buffer = nullptr; + this->device_mem_in = other.device_mem_in; + other.device_mem_in = nullptr; + this->device_mem_out = other.device_mem_out; + other.device_mem_out = nullptr; + this->host_mem_in.swap(other.host_mem_in); + this->host_mem_out.swap(other.host_mem_out); + } + + void run_transform() + { + // Create rocFFT plans (forward + inverse) + std::vector lengths(dim, N); + EXPECT_EQ(rocfft_plan_create(&plan, + rocfft_placement_notinplace, + rocfft_transform_type_complex_forward, + rocfft_precision_single, + dim, + lengths.data(), + 1, + nullptr), + rocfft_status_success); + + EXPECT_EQ(rocfft_plan_create(&plan_inv, + rocfft_placement_inplace, + rocfft_transform_type_complex_inverse, + rocfft_precision_single, + dim, + lengths.data(), + 1, + nullptr), + rocfft_status_success); + + // allocate work buffer if necessary + EXPECT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success); + // NOTE: assuming that same-sized work buffer is ok for both + // forward and inverse transforms + if(work_buffer_size) + { + EXPECT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess); + } + + EXPECT_EQ(hipStreamCreate(&stream), hipSuccess); + rocfft_execution_info info; + EXPECT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); + EXPECT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); + EXPECT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer, work_buffer_size), + rocfft_status_success); + + // Execute forward plan out-of-place + EXPECT_EQ(rocfft_execute(plan, &device_mem_in, &device_mem_out, info), + rocfft_status_success); + // Execute inverse plan in-place + EXPECT_EQ(rocfft_execute(plan_inv, &device_mem_out, nullptr, info), rocfft_status_success); + + EXPECT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success); + + // Apply normalization so the values really are comparable + hipLaunchKernelGGL(normalize_inverse_results, + host_mem_out.size(), + 1, + 0, // sharedMemBytes + stream, // stream + static_cast(device_mem_out), + static_cast(host_mem_out.size())); + ran_transform = true; + } + + void do_cleanup() + { + // complain loudly if we set up for a transform but did not + // actually run it + if(plan && !ran_transform) + ADD_FAILURE(); + + // wait for execution to finish + if(stream) + { + EXPECT_EQ(hipStreamSynchronize(stream), hipSuccess); + EXPECT_EQ(hipStreamDestroy(stream), hipSuccess); + stream = nullptr; + } + + EXPECT_EQ(hipFree(work_buffer), hipSuccess); + work_buffer = nullptr; + + EXPECT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); + plan = nullptr; + EXPECT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success); + plan_inv = nullptr; + + // Copy result back to host + if(device_mem_out && !host_mem_out.empty()) + { + EXPECT_EQ(hipMemcpy(host_mem_out.data(), + device_mem_out, + host_mem_out.size() * sizeof(float2), + hipMemcpyDeviceToHost), + hipSuccess); + + // compare data we got to the original + auto diff = LinfL2diff_1to1_complex( + reinterpret_cast*>(host_mem_in.data()), + reinterpret_cast*>(host_mem_out.data()), + // data is all contiguous, we can treat it as 1d + host_mem_in.size(), + 1, + 1, + host_mem_in.size(), + 1, + host_mem_out.size()); + // we're running 2 transforms (forward+inverse), so we + // should tolerate 2x the error of a single transform + const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); + EXPECT_LT(diff.first, MAX_TRANSFORM_ERROR); + EXPECT_LT(diff.second, MAX_TRANSFORM_ERROR); + + // Free device buffers + EXPECT_EQ(hipFree(device_mem_in), hipSuccess); + device_mem_in = nullptr; + EXPECT_EQ(hipFree(device_mem_out), hipSuccess); + device_mem_out = nullptr; + host_mem_in.clear(); + host_mem_out.clear(); + } + } + ~Test_Transform() + { + do_cleanup(); + } + size_t N = 0; + size_t dim = 0; + uint32_t seed = 0; + hipStream_t stream = nullptr; + rocfft_plan plan = nullptr; + rocfft_plan plan_inv = nullptr; + size_t work_buffer_size = 0; + void* work_buffer = nullptr; + void* device_mem_in = nullptr; + void* device_mem_out = nullptr; + std::vector host_mem_in; + std::vector host_mem_out; + + // ensure that we don't forget to actually run the transform + bool ran_transform = false; +}; + +// run concurrent transforms, one per thread, size N on each dimension +static void multithread_transform(size_t N, size_t dim, size_t num_threads) +{ + std::vector threads; + threads.reserve(num_threads); + for(size_t j = 0; j < num_threads; ++j) + { + threads.emplace_back([=]() { + Test_Transform t(N, dim, j); + t.run_transform(); + }); + } + for(auto& t : threads) + t.join(); +} + +// for multi-stream tests, set up a bunch of streams, then execute +// all of those transforms from a single thread. afterwards, +// wait/verify/cleanup in parallel to save wall time during the test. +static void multistream_transform(size_t N, size_t dim, size_t num_streams) +{ + std::vector> transforms; + transforms.resize(num_streams); + std::vector threads; + threads.reserve(num_streams); + + // get all data ready in parallel + for(size_t i = 0; i < num_streams; ++i) + threads.emplace_back( + [=, &transforms]() { transforms[i] = std::make_unique(N, dim, i); }); + for(auto& t : threads) + t.join(); + threads.clear(); + + // now start the actual transforms serially, but in separate + // streams + for(auto& t : transforms) + t->run_transform(); + + // clean up + for(size_t i = 0; i < transforms.size(); ++i) + threads.emplace_back([=, &transforms]() { transforms[i]->do_cleanup(); }); + for(auto& t : threads) + t.join(); +} + +// pick arbitrary sizes here to get some parallelism while still +// fitting into e.g. 8 GB of GPU memory +TEST(rocfft_UnitTest, simple_multithread_1D) +{ + multithread_transform(1048576, 1, 64); +} + +TEST(rocfft_UnitTest, simple_multithread_2D) +{ + multithread_transform(1024, 2, 64); +} + +TEST(rocfft_UnitTest, simple_multithread_3D) +{ + multithread_transform(128, 3, 40); +} + +TEST(rocfft_UnitTest, simple_multistream_1D) +{ + multistream_transform(1048576, 1, 32); +} + +TEST(rocfft_UnitTest, simple_multistream_2D) +{ + multistream_transform(1024, 2, 32); +} + +TEST(rocfft_UnitTest, simple_multistream_3D) +{ + multistream_transform(128, 3, 32); +} diff --git a/clients/tests/unit_test.cpp b/clients/tests/unit_test.cpp index f0b6cc7b..da058222 100644 --- a/clients/tests/unit_test.cpp +++ b/clients/tests/unit_test.cpp @@ -31,71 +31,6 @@ #include #include -TEST(rocfft_UnitTest, sample_code_iterative) -{ - for(size_t j = 0; j < 10; j++) - { - // rocFFT gpu compute - // ======================================== - - size_t N = 16; - size_t Nbytes = N * sizeof(float2); - - // Create HIP device buffer - float2* x; - hipMalloc(&x, Nbytes); - - // Initialize data - std::vector cx(N); - for(size_t i = 0; i < N; i++) - { - cx[i].x = 1; - cx[i].y = -1; - } - - // Copy data to device - hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); - - // Create rocFFT plan - rocfft_plan plan = NULL; - size_t length = N; - rocfft_plan_create(&plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_single, - 1, - &length, - 1, - NULL); - - // Execute plan - rocfft_execute(plan, (void**)&x, NULL, NULL); - - // Wait for execution to finish - hipDeviceSynchronize(); - - // Destroy plan - rocfft_plan_destroy(plan); - - // Copy result back to host - std::vector y(N); - hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost); - - // Check results - EXPECT_EQ((float)N, y[0].x); - EXPECT_EQ(-(float)N, y[0].y); - - for(size_t i = 1; i < N; i++) - { - EXPECT_EQ(0, y[i].x); - EXPECT_EQ(0, y[i].y); - } - - // Free device buffer - hipFree(x); - } -} - TEST(rocfft_UnitTest, plan_description) { rocfft_plan_description desc = nullptr;