Hello. I’m pretty new to Cuda, and I’ve got a question about a performance difference I’m seeing.
When I pass in all my data as a bunch of arrays as parameters to my kernel function, I’m seeing the best performance.
When I passed the data in as an array of structs instead, performance was worse. Then I learned about coalescing, refactored to a single struct containing a bunch of arrays (structure-of-arrays), and that greatly improved performance.
But I’m still seeing a difference in performance between the coalesced struct parameters method, and the bunch-of-arrays parameter method that I don’t understand.
Do you folks know why passing in a bunch of array parameters would still perform better than passing in a single struct that contains the same coalesced arrays?
This was done in google colab using a T4 GPU
code (bunch-of-parameters):
%%writefile TestProg.cu
// NOTE(review): the forum's markdown ate the '#' on every preprocessor line,
// the header name on the first include, and the underscores on __FILE__ /
// __LINE__. Restored here. fprintf (used in gpuAssert below) lives in
// <cstdio>, which is presumably what the stripped first include was — confirm.
#include <cstdio>
#include <math.h>
#include <assert.h>
#include "cuda_fp16.h"
using namespace std;
#define MAX_NUM_THREADS_PER_BLOCK 1024
// Wraps every CUDA API call so failures report the call site.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Print a readable diagnostic (error string, source file, line) for a failed
// CUDA API call; by default, abort the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        // Copy/paste turned the format string's quotes into smart quotes,
        // which does not compile; restored to plain double quotes.
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Elementwise half2 FMA: y[i] = m[i]*x[i] + b[i] for i in [0, num).
// Grid-stride loop, so any <<<blocks, threads>>> configuration is correct.
// ('__global__' lost its underscores to markdown italics; restored.)
// Read-only operands are marked const __restrict__ so the compiler may use
// the read-only data cache and knows the inputs don't alias the output.
__global__ void run(half2 * __restrict__ y,
                    const half2 * __restrict__ m,
                    const half2 * __restrict__ x,
                    const half2 * __restrict__ b,
                    int num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < num; i += stride)
    {
        y[i] = m[i] * x[i] + b[i];
    }
}
// Allocate four managed half2 arrays, initialize on the host, prefetch to the
// GPU, and run the FMA kernel twice (first launch pays one-time costs; the
// second is the one to read in the profiler).
int main ()
{
    int num = 1024 * 1024;
    half2 *y = NULL;
    half2 *m = NULL;
    half2 *x = NULL;
    half2 *b = NULL;
    // Check every allocation: cudaMallocManaged can fail, and an unchecked
    // failure here surfaces later as a confusing kernel fault.
    gpuErrorCheck(cudaMallocManaged(&y, sizeof(half2) * num));
    gpuErrorCheck(cudaMallocManaged(&m, sizeof(half2) * num));
    gpuErrorCheck(cudaMallocManaged(&x, sizeof(half2) * num));
    gpuErrorCheck(cudaMallocManaged(&b, sizeof(half2) * num));
    // Host-side init: each half2 holds the pair (2.0, 2.0).
    for (int i = 0; i < num; i++)
    {
        y[i] = __float2half2_rn(2.0f);
        m[i] = __float2half2_rn(2.0f);
        x[i] = __float2half2_rn(2.0f);
        b[i] = __float2half2_rn(2.0f);
    }
    int deviceId;
    gpuErrorCheck(cudaGetDevice(&deviceId));
    // Prefetch so the kernel timing isn't dominated by on-demand page faults.
    gpuErrorCheck(cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId));
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;
    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(y, m, x, b, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(y, m, x, b, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    // Release the managed allocations (previously leaked until process exit).
    gpuErrorCheck(cudaFree(y));
    gpuErrorCheck(cudaFree(m));
    gpuErrorCheck(cudaFree(x));
    gpuErrorCheck(cudaFree(b));
    return 0;
}
code (coalesced struct):
%%writefile TestProg.cu
// NOTE(review): same markdown damage as the first listing — '#' stripped from
// preprocessor lines, first include's header name swallowed, __FILE__/__LINE__
// underscores lost. Restored; the two commented-out includes are kept as the
// author wrote them.
#include <cstdio>
//#include <math.h>
//#include <assert.h>
#include "cuda_fp16.h"
using namespace std;
#define MAX_NUM_THREADS_PER_BLOCK 1024
// Wraps every CUDA API call so failures report the call site.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Print a readable diagnostic (error string, source file, line) for a failed
// CUDA API call; by default, abort the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        // Copy/paste turned the format string's quotes into smart quotes,
        // which does not compile; restored to plain double quotes.
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Structure-of-arrays container for the y = m*x + b operands: the four data
// buffers are separate contiguous managed arrays (coalesced-friendly); only
// the pointers and the count live in this struct.
//
// NOTE(review) on the performance question: when the pointers are passed as
// kernel launch parameters they are delivered in constant memory and
// broadcast to all threads. Here every thread must instead first load `num`
// and the four pointers through `foos` from global memory before touching
// the data, and (without __restrict__) the compiler can no longer assume the
// buffers don't alias each other. That indirection is the likely remaining
// gap vs. the plain-parameters version — worth confirming in Nsight Compute.
struct foo
{
    int num;          // element count of each array below
    half2 *y = NULL;  // output
    half2 *m = NULL;  // multiplicand
    half2 *x = NULL;  // multiplier
    half2 *b = NULL;  // addend
    // Allocate the managed buffers, fill each element with the pair
    // (2.0, 2.0), and prefetch everything to the current device.
    foo (int num) : num(num)
    {
        // Allocations were previously unchecked; a failure surfaced later
        // as a confusing kernel fault.
        gpuErrorCheck(cudaMallocManaged(&y, sizeof(half2) * num));
        gpuErrorCheck(cudaMallocManaged(&m, sizeof(half2) * num));
        gpuErrorCheck(cudaMallocManaged(&x, sizeof(half2) * num));
        gpuErrorCheck(cudaMallocManaged(&b, sizeof(half2) * num));
        for (int i = 0; i < num; i++)
        {
            y[i] = __float2half2_rn(2.0f);
            m[i] = __float2half2_rn(2.0f);
            x[i] = __float2half2_rn(2.0f);
            b[i] = __float2half2_rn(2.0f);
        }
        int deviceId;
        gpuErrorCheck(cudaGetDevice(&deviceId));
        gpuErrorCheck(cudaMemPrefetchAsync(y, sizeof(half2) * num, deviceId));
        gpuErrorCheck(cudaMemPrefetchAsync(m, sizeof(half2) * num, deviceId));
        gpuErrorCheck(cudaMemPrefetchAsync(x, sizeof(half2) * num, deviceId));
        gpuErrorCheck(cudaMemPrefetchAsync(b, sizeof(half2) * num, deviceId));
    }
    // Release the managed buffers (previously leaked). Host-only: must not
    // be invoked from device code.
    ~foo ()
    {
        cudaFree(y);
        cudaFree(m);
        cudaFree(x);
        cudaFree(b);
    }
    // Grid-stride FMA over all elements: y[i] = m[i]*x[i] + b[i].
    // ('__device__' lost its underscores to markdown italics; restored.)
    __device__ inline void bar()
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = blockDim.x * gridDim.x;
        for (int i = index; i < num; i += stride)
        {
            y[i] = m[i] * x[i] + b[i];
        }
    }
};
// Thin launcher: every thread calls foo::bar(), which runs its own
// grid-stride loop. The `num` parameter is unused here — foo carries its own
// count — but is kept so existing call sites compile unchanged.
// ('__global__' lost its underscores to markdown italics; restored.)
__global__ void run(foo *foos, int num)
{
    (void)num;  // silence unused-parameter warning
    foos->bar();
}
// Construct a single foo (structure-of-arrays) in managed memory, prefetch
// the struct itself, and run the kernel twice (first launch pays one-time
// costs; profile the second).
int main ()
{
    int num = 1024 * 1024;
    foo *foos = NULL;
    gpuErrorCheck(cudaMallocManaged(&foos, sizeof(foo)));
    // Placement-new: construct the object inside the managed allocation so
    // both host and device can read its fields.
    new (foos) foo(num);
    int deviceId;
    gpuErrorCheck(cudaGetDevice(&deviceId));
    gpuErrorCheck(cudaMemPrefetchAsync(foos, sizeof(foo), deviceId));
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    int numberOfBlocks = (num + MAX_NUM_THREADS_PER_BLOCK - 1) / MAX_NUM_THREADS_PER_BLOCK;
    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(foos, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    run<<<numberOfBlocks, MAX_NUM_THREADS_PER_BLOCK>>>(foos, num);
    gpuErrorCheck(cudaPeekAtLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    // Placement-new objects are never destroyed automatically: run the
    // destructor explicitly, then release the struct's own allocation
    // (both previously leaked until process exit).
    foos->~foo();
    gpuErrorCheck(cudaFree(foos));
    return 0;
}
performance:
Compilation command:
!nvcc TestProg.cu -o TestProg
Run command:
!nvprof ./TestProg
Thanks again for the help!!