Skip to content

Commit

Permalink
Tests for running transforms in multiple streams and threads
Browse files Browse the repository at this point in the history
* rocfft-test: add simple multithread/multistream tests
  • Loading branch information
evetsso authored Jul 16, 2020
1 parent 7a1d965 commit a7038de
Show file tree
Hide file tree
Showing 4 changed files with 315 additions and 66 deletions.
3 changes: 3 additions & 0 deletions clients/client_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@

#include <algorithm>
#include <complex>
#include <iostream>
#include <numeric>
#include <omp.h>
#include <tuple>
#include <vector>

#include "rocfft.h"
#include <hip/hip_runtime_api.h>
Expand Down
4 changes: 3 additions & 1 deletion clients/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ set(rocfft_test_source
accuracy_test_1D.cpp
accuracy_test_2D.cpp
accuracy_test_3D.cpp
multithread_test.cpp
unit_test.cpp
hipfft_test.cpp
misc/source/test_exception.cpp
Expand Down Expand Up @@ -103,6 +104,7 @@ endif()

set_target_properties( rocfft-test
PROPERTIES DEBUG_POSTFIX "-d"
CXX_EXTENSIONS NO )
CXX_EXTENSIONS NO
CXX_STANDARD 14 )
set_target_properties( rocfft-test
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
309 changes: 309 additions & 0 deletions clients/tests/multithread_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
// Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "../client_utils.h"
#include "rocfft.h"
#include "rocfft_against_fftw.h"
#include <gtest/gtest.h>
#include <hip/hip_runtime.h>
#include <memory>
#include <random>
#include <thread>
#include <vector>

// normalize results of an inverse transform, so it can be directly
// compared to the original data before the forward transform
__global__ void normalize_inverse_results(float2* array, float N)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
array[idx].x /= N;
array[idx].y /= N;
}

// Run a transform of specified dimensions, size N on each dimension.
// Data is randomly generated based on the seed value, and we do a
// forward + inverse transform and compare against what we started
// with.
struct Test_Transform
{
// real constructor sets all the data up and creates the plans
Test_Transform(size_t _N, size_t _dim, uint32_t _seed)
: N(_N)
, dim(_dim)
, seed(_seed)
{
// compute total data size
size_t datasize = 1;
for(size_t i = 0; i < dim; ++i)
{
datasize *= N;
}

size_t Nbytes = datasize * sizeof(float2);

// Create HIP device buffers
EXPECT_EQ(hipMalloc(&device_mem_in, Nbytes), hipSuccess);
EXPECT_EQ(hipMalloc(&device_mem_out, Nbytes), hipSuccess);

// Initialize data
std::minstd_rand gen(seed);
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
host_mem_in.resize(datasize);
host_mem_out.resize(datasize);
for(size_t i = 0; i < datasize; i++)
{
host_mem_in[i].x = dist(gen);
host_mem_in[i].y = dist(gen);
}

// Copy data to device
EXPECT_EQ(hipMemcpy(device_mem_in, host_mem_in.data(), Nbytes, hipMemcpyHostToDevice),
hipSuccess);
}
Test_Transform(const Test_Transform&) = delete;
void operator=(const Test_Transform&) = delete;
Test_Transform(Test_Transform&& other)
{
this->stream = other.stream;
other.stream = nullptr;
this->work_buffer = other.work_buffer;
other.work_buffer = nullptr;
this->device_mem_in = other.device_mem_in;
other.device_mem_in = nullptr;
this->device_mem_out = other.device_mem_out;
other.device_mem_out = nullptr;
this->host_mem_in.swap(other.host_mem_in);
this->host_mem_out.swap(other.host_mem_out);
}

void run_transform()
{
// Create rocFFT plans (forward + inverse)
std::vector<size_t> lengths(dim, N);
EXPECT_EQ(rocfft_plan_create(&plan,
rocfft_placement_notinplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
dim,
lengths.data(),
1,
nullptr),
rocfft_status_success);

EXPECT_EQ(rocfft_plan_create(&plan_inv,
rocfft_placement_inplace,
rocfft_transform_type_complex_inverse,
rocfft_precision_single,
dim,
lengths.data(),
1,
nullptr),
rocfft_status_success);

// allocate work buffer if necessary
EXPECT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success);
// NOTE: assuming that same-sized work buffer is ok for both
// forward and inverse transforms
if(work_buffer_size)
{
EXPECT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess);
}

EXPECT_EQ(hipStreamCreate(&stream), hipSuccess);
rocfft_execution_info info;
EXPECT_EQ(rocfft_execution_info_create(&info), rocfft_status_success);
EXPECT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success);
EXPECT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer, work_buffer_size),
rocfft_status_success);

// Execute forward plan out-of-place
EXPECT_EQ(rocfft_execute(plan, &device_mem_in, &device_mem_out, info),
rocfft_status_success);
// Execute inverse plan in-place
EXPECT_EQ(rocfft_execute(plan_inv, &device_mem_out, nullptr, info), rocfft_status_success);

EXPECT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success);

// Apply normalization so the values really are comparable
hipLaunchKernelGGL(normalize_inverse_results,
host_mem_out.size(),
1,
0, // sharedMemBytes
stream, // stream
static_cast<float2*>(device_mem_out),
static_cast<float>(host_mem_out.size()));
ran_transform = true;
}

void do_cleanup()
{
// complain loudly if we set up for a transform but did not
// actually run it
if(plan && !ran_transform)
ADD_FAILURE();

// wait for execution to finish
if(stream)
{
EXPECT_EQ(hipStreamSynchronize(stream), hipSuccess);
EXPECT_EQ(hipStreamDestroy(stream), hipSuccess);
stream = nullptr;
}

EXPECT_EQ(hipFree(work_buffer), hipSuccess);
work_buffer = nullptr;

EXPECT_EQ(rocfft_plan_destroy(plan), rocfft_status_success);
plan = nullptr;
EXPECT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success);
plan_inv = nullptr;

// Copy result back to host
if(device_mem_out && !host_mem_out.empty())
{
EXPECT_EQ(hipMemcpy(host_mem_out.data(),
device_mem_out,
host_mem_out.size() * sizeof(float2),
hipMemcpyDeviceToHost),
hipSuccess);

// compare data we got to the original
auto diff = LinfL2diff_1to1_complex(
reinterpret_cast<const std::complex<float>*>(host_mem_in.data()),
reinterpret_cast<const std::complex<float>*>(host_mem_out.data()),
// data is all contiguous, we can treat it as 1d
host_mem_in.size(),
1,
1,
host_mem_in.size(),
1,
host_mem_out.size());
// we're running 2 transforms (forward+inverse), so we
// should tolerate 2x the error of a single transform
const double MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();
EXPECT_LT(diff.first, MAX_TRANSFORM_ERROR);
EXPECT_LT(diff.second, MAX_TRANSFORM_ERROR);

// Free device buffers
EXPECT_EQ(hipFree(device_mem_in), hipSuccess);
device_mem_in = nullptr;
EXPECT_EQ(hipFree(device_mem_out), hipSuccess);
device_mem_out = nullptr;
host_mem_in.clear();
host_mem_out.clear();
}
}
~Test_Transform()
{
do_cleanup();
}
size_t N = 0;
size_t dim = 0;
uint32_t seed = 0;
hipStream_t stream = nullptr;
rocfft_plan plan = nullptr;
rocfft_plan plan_inv = nullptr;
size_t work_buffer_size = 0;
void* work_buffer = nullptr;
void* device_mem_in = nullptr;
void* device_mem_out = nullptr;
std::vector<float2> host_mem_in;
std::vector<float2> host_mem_out;

// ensure that we don't forget to actually run the transform
bool ran_transform = false;
};

// run concurrent transforms, one per thread, size N on each dimension
static void multithread_transform(size_t N, size_t dim, size_t num_threads)
{
std::vector<std::thread> threads;
threads.reserve(num_threads);
for(size_t j = 0; j < num_threads; ++j)
{
threads.emplace_back([=]() {
Test_Transform t(N, dim, j);
t.run_transform();
});
}
for(auto& t : threads)
t.join();
}

// for multi-stream tests, set up a bunch of streams, then execute
// all of those transforms from a single thread. afterwards,
// wait/verify/cleanup in parallel to save wall time during the test.
static void multistream_transform(size_t N, size_t dim, size_t num_streams)
{
std::vector<std::unique_ptr<Test_Transform>> transforms;
transforms.resize(num_streams);
std::vector<std::thread> threads;
threads.reserve(num_streams);

// get all data ready in parallel
for(size_t i = 0; i < num_streams; ++i)
threads.emplace_back(
[=, &transforms]() { transforms[i] = std::make_unique<Test_Transform>(N, dim, i); });
for(auto& t : threads)
t.join();
threads.clear();

// now start the actual transforms serially, but in separate
// streams
for(auto& t : transforms)
t->run_transform();

// clean up
for(size_t i = 0; i < transforms.size(); ++i)
threads.emplace_back([=, &transforms]() { transforms[i]->do_cleanup(); });
for(auto& t : threads)
t.join();
}

// pick arbitrary sizes here to get some parallelism while still
// fitting into e.g. 8 GB of GPU memory
TEST(rocfft_UnitTest, simple_multithread_1D)
{
multithread_transform(1048576, 1, 64);
}

TEST(rocfft_UnitTest, simple_multithread_2D)
{
multithread_transform(1024, 2, 64);
}

TEST(rocfft_UnitTest, simple_multithread_3D)
{
multithread_transform(128, 3, 40);
}

TEST(rocfft_UnitTest, simple_multistream_1D)
{
multistream_transform(1048576, 1, 32);
}

TEST(rocfft_UnitTest, simple_multistream_2D)
{
multistream_transform(1024, 2, 32);
}

TEST(rocfft_UnitTest, simple_multistream_3D)
{
multistream_transform(128, 3, 32);
}
Loading

0 comments on commit a7038de

Please sign in to comment.