Hi all,
I recently started using unified memory because it seems to offer the cleanest way to pass a struct containing device pointers to a kernel by reference.
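To make clearer what I mean, here is the idea in its most stripped-down form (just an illustration with placeholder names, not my actual test code; the full program is at the end of the post): because the object itself lives in unified memory, the kernel can dereference it on the device and can therefore take it by reference.

#include <cuda_runtime.h>
#include <cstddef>

// Places objects of derived classes in unified (managed) memory.
class Managed {
public:
    void* operator new(size_t len) {
        void* ptr;
        cudaMallocManaged(&ptr, len);
        cudaDeviceSynchronize();
        return ptr;
    }
    void operator delete(void* ptr) {
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }
};

struct buffer : Managed {
    int m_size = 0;
    float* m_pointer = nullptr;   // also points to unified memory
};

// The struct can be taken by reference because the struct itself is device-accessible.
__global__ void fill(buffer& buf) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < buf.m_size)
        buf.m_pointer[idx] = 0.0f;
}

// Host side (sketch):
//   buffer* buf = new buffer;                                   // goes through Managed::operator new
//   buf->m_size = n;
//   cudaMallocManaged(&buf->m_pointer, n * sizeof(float));
//   fill<<<(n + 255) / 256, 256>>>(*buf);                       // pass by reference
//   cudaDeviceSynchronize();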
I wrote a small program (described in more detail below) that replicates my current project in the simplest way possible and measures the performance with unified memory.
The result was that the performance of my test kernel drops by about 20-25% when the struct is passed by reference, more or less independently of the data size. The same happens when I pass the struct via a pointer.
Even though the test kernel may not represent a real scenario, 25% seems like a lot to me, and so far I could not find a flaw in my testing. Of course, passing a struct that contains pointers by value would end up in a big mess, so I would like to avoid that. I'm using a Pascal Quadro P1000M.
So my question is: why does this happen, and is there maybe a way to avoid it? I could not find anything on this topic so far.
This is how I measured the performance:
I implemented two structs: one called buffer, which contains a device pointer to unified memory, and one called parent, which contains two buffer objects.
I used a kernel that, for each element of one buffer, assigns the output of a small test function to the corresponding element of the other.
Finally, I compared the performance of passing the struct by value / by reference against the same kernel using just two float arrays of the same size.
Copy constructor and destructor are commented out on purpose.
Many thanks and all the best from Munich!
Here is my code (I just changed back and forth between value, reference and pointer):
#include "cuda_runtime.h"
#include <iostream>
#include <chrono>
// Base class that places derived objects in unified (managed) memory
class Managed {
public:
    void *operator new(size_t len) {
        void *ptr;
        cudaMallocManaged(&ptr, len);
        cudaDeviceSynchronize();
        return ptr;
    }
    void operator delete(void *ptr) {
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }
};
#define N 100000000
constexpr int block_size = 256;
constexpr int grid_size = (N + block_size -1 ) / block_size;
constexpr int num_iter = 10;
struct buffer : Managed
{
    int m_size;
    float* m_pointer;
    buffer() : m_size(0), m_pointer(nullptr) {}
    buffer(int size) : m_size(size), m_pointer(nullptr)
    {
        cudaMallocManaged(&m_pointer, m_size * sizeof(float));
    }
    // buffer(const buffer &rhs) {
    //     m_size = rhs.m_size;
    //     cudaMallocManaged(&m_pointer, m_size * sizeof(float));
    //     memcpy(m_pointer, rhs.m_pointer, m_size * sizeof(float));
    // }
    ~buffer()
    {
        // cudaFree(m_pointer);
    }
};
struct parent : Managed
{
    buffer buffer1;
    buffer buffer2;
    parent(int size) : buffer1(size), buffer2(size) {}
};
__device__ __forceinline__ float test_func(float x)
{
    return sqrtf(x) / expf(x) * 10 + 100;
}

__global__ void kernel(float* arr, float* arr_dst)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) { return; }
    arr_dst[idx] = test_func(arr[idx]);
}

__global__ void kernel(parent& instance)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) { return; }
    instance.buffer1.m_pointer[idx] = test_func(instance.buffer2.m_pointer[idx]);
}
int main()
{
    parent* buf = new parent(N);
    float *arr, *arr_dst;
    cudaMallocManaged(&arr, N * sizeof(float));
    cudaMallocManaged(&arr_dst, N * sizeof(float));

    // warm-up launches
    kernel<<<grid_size, block_size>>>(*buf);
    kernel<<<grid_size, block_size>>>(arr, arr_dst);
    cudaDeviceSynchronize();

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < num_iter; ++i) {
        kernel<<<grid_size, block_size>>>(arr, arr_dst);
        cudaDeviceSynchronize();
    }
    auto stop = std::chrono::high_resolution_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();

    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < num_iter; ++i) {
        kernel<<<grid_size, block_size>>>(*buf);
        cudaDeviceSynchronize();
    }
    stop = std::chrono::high_resolution_clock::now();
    auto elapsed1 = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();

    std::cout << "struct by value/reference " << elapsed1 << " arrays " << elapsed << " "
              << (double)elapsed1 / (double)elapsed << std::endl;

    cudaFree(arr);
    cudaFree(arr_dst);
    cudaFree(buf->buffer1.m_pointer);
    cudaFree(buf->buffer2.m_pointer);
    cudaFree(buf);
    return 0;
}