Struct vs. parameters performance difference

Hello. I’m pretty new to Cuda, and I’ve got a question about a performance difference I’m seeing.

When I pass in all my data as a bunch of arrays as parameters to my kernel function, I’m seeing the best performance.

But, when I passed it in as an array of structs, I saw worse performance. Then I learned about coalescing, so I refactored to pass in a single struct containing a bunch of arrays instead, and that greatly improved performance.

But I’m still seeing a difference in performance between the coalesced struct parameters method, and the bunch-of-arrays parameter method that I don’t understand.

Do you folks know why passing in a bunch of parameters would still result in better performance than passing in a struct that contains a bunch of coalesced data?

This was done in google colab using a T4 GPU

code (bunch-of-parameters):

%%writefile TestProg.cu

#include <iostream>
#include <math.h>
#include <assert.h>
#include "cuda_fp16.h"

using namespace std;

#define MAX_NUM_THREADS_PER_BLOCK 1024
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the CUDA error string plus the source location (supplied by the
// gpuErrorCheck macro via __FILE__/__LINE__) and exits with the error code
// unless abort is disabled.
// Fix: the forum paste turned the format string's quotes into typographic
// quotes, which does not compile; restored to plain ASCII quotes.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Elementwise half2 FMA over four arrays: y[i] = m[i]*x[i] + b[i].
// Uses a grid-stride loop, so any grid size (even one smaller than num)
// covers all elements.
// Fix: the forum paste dropped the double underscores on __global__.
__global__ void run(half2 *y, half2 *m, half2 *x, half2 *b, int num)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = index; i < num; i += stride)
  {
    y[i] = m[i]*x[i] + b[i];
  }
}

// Host driver for the "bunch of parameters" variant: allocates four managed
// half2 arrays, initializes every element to half2(2.0, 2.0), prefetches the
// data to the current device, then launches the kernel twice — the second
// launch shows steady-state kernel time without first-touch overhead.
int main ()
{

// 1M half2 elements per array (2M half values).
int num = 1024 * 1024;

half2 *y = NULL;
half2 *m = NULL;
half2 *x = NULL;
half2 *b = NULL;

// Managed (unified) memory: accessible from both host and device.
// NOTE(review): return codes are unchecked here even though gpuErrorCheck
// exists in this file.
cudaMallocManaged(&y, sizeof(half2) * num);
cudaMallocManaged(&m, sizeof(half2) * num);
cudaMallocManaged(&x, sizeof(half2) * num);
cudaMallocManaged(&b, sizeof(half2) * num);

// Host-side initialization touches the pages first on the CPU.
for (int i = 0; i < num; i++)
{
y[i] = __float2half2_rn(2.0f);
m[i] = __float2half2_rn(2.0f);
x[i] = __float2half2_rn(2.0f);
b[i] = __float2half2_rn(2.0f);
}

// Prefetch to the GPU so the kernel does not pay demand-paging cost.
int deviceId;
cudaGetDevice(&deviceId);
cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId);
cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId);
cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId);
cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId);

gpuErrorCheck(cudaPeekAtLastError());
gpuErrorCheck(cudaDeviceSynchronize());

// Ceiling division so a partial final block covers the tail.
int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;

run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(y, m, x, b, num);

gpuErrorCheck(cudaPeekAtLastError());
gpuErrorCheck(cudaDeviceSynchronize());

// Second launch: steady-state timing.
run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(y, m, x, b, num);

gpuErrorCheck(cudaPeekAtLastError());
gpuErrorCheck(cudaDeviceSynchronize());

// NOTE(review): the managed arrays are never freed; they are reclaimed at
// process exit.
return 0;
}

code (coalesced struct):
%%writefile TestProg.cu

#include <iostream>
//#include <math.h>
//#include <assert.h>
#include "cuda_fp16.h"

using namespace std;

#define MAX_NUM_THREADS_PER_BLOCK 1024
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the CUDA error string plus source location and exits with the
// error code unless abort is disabled.
// Fix: restored plain ASCII quotes in the format string (the forum paste
// converted them to typographic quotes, which does not compile).
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Struct-of-arrays container: one struct owning four separately allocated
// half2 arrays, so consecutive threads in a warp touch consecutive elements
// of each array (coalesced access).
struct foo
{
  int num;          // element count of each array
  half2 *y = NULL;  // output
  half2 *m = NULL;  // multiplicand
  half2 *x = NULL;  // input
  half2 *b = NULL;  // addend

  // Allocates the managed arrays, fills every element with half2(2.0, 2.0),
  // and prefetches the data to the current device.
  // NOTE(review): the cudaMallocManaged/prefetch return codes are unchecked.
  foo (int num) : num(num)
  {
    cudaMallocManaged(&y, sizeof(half2) * num);
    cudaMallocManaged(&m, sizeof(half2) * num);
    cudaMallocManaged(&x, sizeof(half2) * num);
    cudaMallocManaged(&b, sizeof(half2) * num);

    for (int i = 0; i < num; i++)
    {
      y[i] = __float2half2_rn(2.0f);
      m[i] = __float2half2_rn(2.0f);
      x[i] = __float2half2_rn(2.0f);
      b[i] = __float2half2_rn(2.0f);
    }

    int deviceId;
    cudaGetDevice(&deviceId);

    cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId);
    cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId);
    cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId);
    cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId);
  }

  // Device-side grid-stride loop computing y[i] = m[i]*x[i] + b[i].
  // Fix: the forum paste dropped the double underscores on __device__.
  __device__ inline void bar()
  {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < num; i += stride)
    {
      y[i] = m[i]*x[i] + b[i];
    }
  }
};

// Kernel entry point: forwards to foo::bar(). Each thread dereferences the
// struct pointer to reach the array pointers — one extra global-memory
// indirection compared with passing the arrays as kernel parameters.
// Fix: the forum paste dropped the double underscores on __global__.
__global__ void run(foo *foos, int num)
{
  foos->bar();
}

// Host driver for the struct variant: constructs a foo inside managed memory
// (placement-new), prefetches the struct itself, and launches the kernel
// twice so the second launch reflects steady-state time.
int main ()
{
int num = 1024 * 1024;

foo *foos = NULL;

// The struct lives in managed memory so both host and device can read it.
cudaMallocManaged(&foos, sizeof(foo));

// Placement-new constructs foo (which allocates/initializes its arrays)
// inside the managed allocation.
new (foos) foo(num);

int deviceId;
cudaGetDevice(&deviceId);
cudaMemPrefetchAsync(foos, sizeof(foo), deviceId);

gpuErrorCheck(cudaPeekAtLastError());
gpuErrorCheck(cudaDeviceSynchronize());

// Ceiling division so a partial final block covers the tail.
int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;

run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(foos, num);

gpuErrorCheck(cudaPeekAtLastError());
gpuErrorCheck(cudaDeviceSynchronize());

// Second launch: steady-state timing.
run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(foos, num);

gpuErrorCheck(cudaPeekAtLastError());
gpuErrorCheck(cudaDeviceSynchronize());

// NOTE(review): foo is never destroyed and the managed memory never freed;
// reclaimed at process exit.
return 0;
}

performance:

Compilation command:
!nvcc TestProg.cu -o TestProg

Run command:
!nvprof ./TestProg

Thanks again for the help!!

please format code correctly on these forums. Edit your post with the pencil icon, select the code, click </> button, save your changes. Please do not post pictures of text

Sorry, it looks like it won’t let me update my own post. So I guess I’ll reply to it instead haha :D

Here is the struct option which is slightly slower than the parameters way of doing it.

%%writefile TestProg.cu

#include <iostream>
#include "cuda_fp16.h"

using namespace std;

#define MAX_NUM_THREADS_PER_BLOCK 1024
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a CUDA error with the source location supplied by the
// gpuErrorCheck macro, then terminates with the error code unless the
// caller opted out of aborting.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
      return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

   if (abort)
      exit(code);
}

// Struct-of-arrays container: a single struct owning four separately
// allocated half2 arrays, so consecutive threads in a warp touch
// consecutive elements of each array (coalesced access).
struct foo
{

  int num;          // element count of each array
  half2 *y = NULL;  // output
  half2 *m = NULL;  // multiplicand
  half2 *x = NULL;  // input
  half2 *b = NULL;  // addend

  // Allocates the managed arrays, fills every element with half2(2.0, 2.0),
  // and prefetches them to the current device so the kernel avoids
  // demand-paging. Fix: all CUDA API calls are now routed through the
  // file's gpuErrorCheck macro instead of being silently ignored.
  foo (int num) : num(num)
  {
    gpuErrorCheck(cudaMallocManaged(&y, sizeof(half2) * num));
    gpuErrorCheck(cudaMallocManaged(&m, sizeof(half2) * num));
    gpuErrorCheck(cudaMallocManaged(&x, sizeof(half2) * num));
    gpuErrorCheck(cudaMallocManaged(&b, sizeof(half2) * num));

    for (int i = 0; i < num; i++)
    {
      y[i] = __float2half2_rn(2.0f);
      m[i] = __float2half2_rn(2.0f);
      x[i] = __float2half2_rn(2.0f);
      b[i] = __float2half2_rn(2.0f);
    }

    int deviceId;
    gpuErrorCheck(cudaGetDevice(&deviceId));

    gpuErrorCheck(cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId));
  }

  // Deliberately no owning destructor: instances may be copied by value
  // into a kernel, and a destructor on the device-side copy would free the
  // arrays out from under the host. Callers release the arrays explicitly.

  // Device-side grid-stride loop computing y[i] = m[i]*x[i] + b[i].
  __device__ inline void bar()
  {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < num; i += stride)
    {
      y[i] = m[i]*x[i] + b[i];
    }
  }
};

// Kernel entry point: forwards to foo::bar().
// NOTE(review): the struct arrives as a pointer, so every thread must first
// load foo's fields (num and the four array pointers) through that pointer
// before touching any data — one extra level of global-memory indirection
// versus passing the arrays (or the struct by value) as kernel parameters.
__global__ void run(foo *foos, int num)
{
  foos->bar();
}

// Host driver for the struct variant: constructs a foo inside managed
// memory (so host and device share it), prefetches the struct, and launches
// the kernel twice — the second launch reflects steady-state time without
// first-touch overhead. Fix: CUDA API calls are now error-checked and the
// managed allocations are released before exit instead of leaking.
int main ()
{
  int num = 1024 * 1024;

  foo *foos = NULL;

  gpuErrorCheck(cudaMallocManaged(&foos, sizeof(foo)));

  // Placement-new: construct foo (which allocates and initializes its
  // arrays) inside the managed allocation.
  new (foos) foo(num);

  int deviceId;
  gpuErrorCheck(cudaGetDevice(&deviceId));
  gpuErrorCheck(cudaMemPrefetchAsync(foos, sizeof(foo), deviceId));

  gpuErrorCheck(cudaPeekAtLastError());
  gpuErrorCheck(cudaDeviceSynchronize());

  // Ceiling division so a partial final block covers the tail.
  int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;

  run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(foos, num);

  gpuErrorCheck(cudaPeekAtLastError());
  gpuErrorCheck(cudaDeviceSynchronize());

  // Second launch: steady-state timing.
  run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(foos, num);

  gpuErrorCheck(cudaPeekAtLastError());
  gpuErrorCheck(cudaDeviceSynchronize());

  // Release the managed allocations (previously leaked until process exit).
  gpuErrorCheck(cudaFree(foos->y));
  gpuErrorCheck(cudaFree(foos->m));
  gpuErrorCheck(cudaFree(foos->x));
  gpuErrorCheck(cudaFree(foos->b));
  gpuErrorCheck(cudaFree(foos));

  return 0;
}

Results for struct method:

==2525== NVPROF is profiling process 2525, command: ./TestProg
==2525== Profiling application: ./TestProg
==2525== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  172.73us         2  86.365us  85.374us  87.357us  run(foo*, int)
      API calls:   97.32%  112.84ms         5  22.569ms  12.318us  112.76ms  cudaMallocManaged
                    1.79%  2.0749ms         3  691.63us  86.349us  1.8975ms  cudaDeviceSynchronize
                    0.51%  587.38us         5  117.48us  3.6540us  383.61us  cudaMemPrefetchAsync
                    0.23%  269.55us         2  134.78us  6.8140us  262.74us  cudaLaunchKernel
                    0.12%  140.31us       114  1.2300us     143ns  56.608us  cuDeviceGetAttribute
                    0.01%  11.512us         1  11.512us  11.512us  11.512us  cuDeviceGetName
                    0.01%  8.7580us         2  4.3790us  1.5620us  7.1960us  cudaGetDevice
                    0.00%  5.6930us         1  5.6930us  5.6930us  5.6930us  cuDeviceGetPCIBusId
                    0.00%  4.7510us         1  4.7510us  4.7510us  4.7510us  cuDeviceTotalMem
                    0.00%  1.5600us         3     520ns     200ns  1.1170us  cuDeviceGetCount
                    0.00%  1.3000us         3     433ns     184ns     655ns  cudaPeekAtLastError
                    0.00%     934ns         2     467ns     193ns     741ns  cuDeviceGet
                    0.00%     577ns         1     577ns     577ns     577ns  cuModuleGetLoadingMode
                    0.00%     218ns         1     218ns     218ns     218ns  cuDeviceGetUuid

==2525== Unified Memory profiling result:
Device "Tesla T4 (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
       9  1.7782MB  4.0000KB  2.0000MB  16.00391MB  1.393567ms  Host To Device
Total CPU Page faults: 49

Code for the faster ‘parameters’ method.

%%writefile TestProg.cu

#include <iostream>
#include <math.h>
#include <assert.h>
#include "cuda_fp16.h"


using namespace std;


#define MAX_NUM_THREADS_PER_BLOCK 1024
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the CUDA error string with the source location that triggered it
// (supplied by the gpuErrorCheck macro via __FILE__/__LINE__), and exits
// with the error code unless abort is disabled.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Elementwise half2 FMA over four arrays: y[i] = m[i]*x[i] + b[i].
// Improvement: m, x and b are only read, so they are marked
// const __restrict__ (and y __restrict__) — this promises no aliasing and
// lets the compiler use read-only-cache loads. Callers passing plain
// half2* pointers are unaffected.
__global__ void run(half2 * __restrict__ y, const half2 * __restrict__ m,
                    const half2 * __restrict__ x, const half2 * __restrict__ b,
                    int num)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  // Grid-stride loop: correct for any grid size, even one smaller than num.
  for (int i = index; i < num; i += stride)
  {
    y[i] = m[i]*x[i] + b[i];
  }
}


// Host driver for the "bunch of parameters" variant: allocates four managed
// half2 arrays, initializes every element to half2(2.0, 2.0), prefetches
// them to the current device, then launches the kernel twice — the second
// launch reflects steady-state time without first-touch overhead.
// Fix: CUDA API calls are now error-checked and the arrays are freed.
int main ()
{
  // 1M half2 elements per array (2M half values).
  int num = 1024 * 1024;

  half2 *y = NULL;
  half2 *m = NULL;
  half2 *x = NULL;
  half2 *b = NULL;

  gpuErrorCheck(cudaMallocManaged(&y, sizeof(half2) * num));
  gpuErrorCheck(cudaMallocManaged(&m, sizeof(half2) * num));
  gpuErrorCheck(cudaMallocManaged(&x, sizeof(half2) * num));
  gpuErrorCheck(cudaMallocManaged(&b, sizeof(half2) * num));

  // Host-side initialization touches the pages first on the CPU.
  for (int i = 0; i < num; i++)
  {
    y[i] = __float2half2_rn(2.0f);
    m[i] = __float2half2_rn(2.0f);
    x[i] = __float2half2_rn(2.0f);
    b[i] = __float2half2_rn(2.0f);
  }

  // Prefetch to the GPU so the kernel avoids demand-paging faults.
  int deviceId;
  gpuErrorCheck(cudaGetDevice(&deviceId));
  gpuErrorCheck(cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId));
  gpuErrorCheck(cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId));
  gpuErrorCheck(cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId));
  gpuErrorCheck(cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId));

  gpuErrorCheck(cudaPeekAtLastError());
  gpuErrorCheck(cudaDeviceSynchronize());

  // Ceiling division so a partial final block covers the tail.
  int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;

  run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(y, m, x, b, num);

  gpuErrorCheck(cudaPeekAtLastError());
  gpuErrorCheck(cudaDeviceSynchronize());

  // Second launch: steady-state timing.
  run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(y, m, x, b, num);

  gpuErrorCheck(cudaPeekAtLastError());
  gpuErrorCheck(cudaDeviceSynchronize());

  // Release the managed allocations (previously leaked until process exit).
  gpuErrorCheck(cudaFree(y));
  gpuErrorCheck(cudaFree(m));
  gpuErrorCheck(cudaFree(x));
  gpuErrorCheck(cudaFree(b));

  return 0;
}

Performance results:

==3162== NVPROF is profiling process 3162, command: ./TestProg
==3162== Profiling application: ./TestProg
==3162== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  147.04us         2  73.518us  72.478us  74.559us  run(__half2*, __half2*, __half2*, __half2*, int)
      API calls:   73.26%  113.91ms         4  28.477ms  11.944us  113.84ms  cudaMallocManaged
                   24.93%  38.767ms         2  19.384ms  6.7890us  38.761ms  cudaLaunchKernel
                    1.29%  2.0100ms         3  669.99us  71.610us  1.8603ms  cudaDeviceSynchronize
                    0.38%  597.47us         4  149.37us  3.9200us  399.73us  cudaMemPrefetchAsync
                    0.11%  167.85us       114  1.4720us     137ns  60.944us  cuDeviceGetAttribute
                    0.01%  11.983us         1  11.983us  11.983us  11.983us  cuDeviceGetName
                    0.00%  6.8000us         1  6.8000us  6.8000us  6.8000us  cudaGetDevice
                    0.00%  5.5540us         1  5.5540us  5.5540us  5.5540us  cuDeviceGetPCIBusId
                    0.00%  4.6420us         1  4.6420us  4.6420us  4.6420us  cuDeviceTotalMem
                    0.00%  1.7600us         3     586ns     194ns  1.2380us  cuDeviceGetCount
                    0.00%  1.6600us         3     553ns     176ns     772ns  cudaPeekAtLastError
                    0.00%     862ns         2     431ns     181ns     681ns  cuDeviceGet
                    0.00%     603ns         1     603ns     603ns     603ns  cuModuleGetLoadingMode
                    0.00%     259ns         1     259ns     259ns     259ns  cuDeviceGetUuid

==3162== Unified Memory profiling result:
Device "Tesla T4 (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
       8  2.0000MB  2.0000MB  2.0000MB  16.00000MB  1.389215ms  Host To Device
Total CPU Page faults: 48

Yeah so the performance is close, but I’m curious as to the actual theoretical reason for any difference.

Thanks again for the help :D

Why don’t you pass the struct by value instead of pointer? With pointer, you have a double indirection to access the half2 arrays.

For other differences, you can inspect the SASS code.

1 Like

Ok cool. Thanks, I’ll give that a try.
What are the pros/cons of pass by value in this context? Will the host still have access to that struct if I wanted to process some data on the host side after the kernel runs?
(Sorry still new to all this haha :D )

Pass by value is passing a copy. So changes on the device won’t be seen on the host. But if the passed struct is from a host variable, this original host variable can still be accessed.

Yeah that was it. By doing pass by value I achieved pretty much identical results to passing all the arrays individually to the kernel.

Thanks!!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.