Hi,
I am relatively new to CUDA so I am most likely missing something here.
I provide below a code that resembles the actual kernel in terms of thread/grid sizes,
as well as program flow. The provided code also exhibits weird behavior, i.e. when typecasting
a float number to float the 100x slowdown is gone. I use clock() from the host side to time the kernel and
I have also verified the slowdown with visual profiler.
#include <cuda.h>
#include <stdio.h>
#define mul 1
__global__
void test(short* in, float* out, int a, int b, int c, int d, int e)
{
int pixel = threadIdx.x + blockIdx.x * blockDim.x;
if(pixel < mul * a * b * d * e)
{
float tmp = 0.0;
int id = 0;
float tmp2;
for(int i = 0; i < c; i++)
{
tmp2 = (float)in[id];
tmp = tmp + 3.32 + tmp2;//SLOW
//tmp = tmp + (float)3.32 + tmp2;//FAST
id += 328;
}
out[pixel] = tmp/(float)c;
}
}
int main()
{
int a = 100;
int b = 10;
int c = 128;
int d = 321;
int e = 161;
clock_t t1, t2;
float* d_out;
float* h_out;
short* d_in;
short* h_in;
h_out = (float*)malloc(mul * a * b * d * e * sizeof(float));
h_in = (short*)malloc(a * b * 1000 * c * sizeof(short));//normally contains data
cudaMalloc(&d_out, mul * a * b * d * e * sizeof(float));
cudaMalloc(&d_in, a * b * 1000 * c * sizeof(short));
cudaMemcpy(d_in, h_in, a * b * 1000 * c * sizeof(short), cudaMemcpyHostToDevice);
dim3 blocks((int)ceil( (float)(mul * a * b * d * e) / (float)1024));
dim3 threads_per_block(1024);
t1 = clock();
test<<<blocks, threads_per_block>>>(d_in, d_out, a, b, c, d, e);
cudaDeviceSynchronize();
t2 = clock();
cudaMemcpy(h_out, d_out, a * b * d * e * sizeof(float), cudaMemcpyDeviceToHost);
//printing some random values to make sure the result is correct
for(int i = 10032; i < 10070; i++)
{
printf("%f\n", h_out[i]);
}
printf("time: %ld\n", t2-t1);
}
Using either of the commented lines within the for loop of the kernel, result in a execution time difference of 100x.
The same slowdown also occurs with several other minor code changes and/or when using the -arch switch to
compile for different architectures.
A search online indicated that this could be a compiler optimization problem.
Can you please provide some hints as to what the problem could be and how to avoid similar problems?
The GPU used is quadro RTX6000.
Thanks!