Hello,
I’m a newbie in CUDA programming. I’m writing a function and trying to optimize it; the code of the function is:
// Constant-memory parameters (set from the host with cudaMemcpyToSymbol).
__device__ __constant__ int cd_FractionalDelayFilterLength[1]; // taps per filter row
__device__ __constant__ int cd_FractionalDelayIndex[PATHNUM];  // filter row chosen per path
__device__ __constant__ int cd_FadedSignalLength[1];           // samples per path

// Applies a fractional-delay FIR filter to a complex signal, one output
// sample per thread.
// Expected launch: 1-D grid with blockDim.x == BLOCK_SIZE1.
// Preconditions: FDFILTERN <= BLOCK_SIZE1 (filter staged one tap per thread)
// and FDFILTERN <= WARPSIZE + 1 (the WARPSIZE-sample halo must cover the
// filter reach) — TODO confirm against the actual macro values.
// NOTE(review): cd_TotalFadedSignalLength is referenced below but not declared
// in this snippet — presumably PATHNUM * cd_FadedSignalLength[0]; confirm it
// is declared elsewhere in the file.
__global__ void MyKernel(float2 *x, float2 *y, float *FractionalDelayFilter)
{
    __shared__ float Filter[FDFILTERN];          // taps for this block's path
    __shared__ float xx[BLOCK_SIZE1 + WARPSIZE]; // real parts + halo
    __shared__ float xy[BLOCK_SIZE1 + WARPSIZE]; // imaginary parts + halo

    int tid = threadIdx.x;
    int id  = BLOCK_SIZE1 * blockIdx.x + tid;

    // One thread per tap stages the filter row selected for this sample's path.
    if (tid < FDFILTERN)
    {
        Filter[tid] = FractionalDelayFilter[cd_FractionalDelayFilterLength[0] *
                      cd_FractionalDelayIndex[id / cd_FadedSignalLength[0]] + tid];
    }

    // Stage this block's input, split into separate real/imaginary arrays
    // (structure-of-arrays) so the convolution loop reads are contiguous.
    xx[tid] = x[id].x;
    xy[tid] = x[id].y;

    // The first WARPSIZE threads additionally load the halo samples needed by
    // the last outputs of the block; the guard keeps the tail block in bounds.
    int xid = id + BLOCK_SIZE1;
    if (tid < WARPSIZE && xid < cd_TotalFadedSignalLength[0])
    {
        xx[tid + BLOCK_SIZE1] = x[xid].x;
        xy[tid + BLOCK_SIZE1] = x[xid].y;
    }
    __syncthreads(); // all staging must be visible before any thread convolves

    // Convolve: each thread accumulates FDFILTERN taps from shared memory.
    // (The original loop also re-assigned xid each iteration; that value was
    // never read, so the dead update is removed.)
    float y1 = 0.0f;
    float y2 = 0.0f;
    for (int l = 0; l < FDFILTERN; l++)
    {
        y1 += xx[tid + l] * Filter[l];
        y2 += xy[tid + l] * Filter[l];
    }
    y[id].x = y1;
    y[id].y = y2;
}
Its run time is about 1.9 ms.
I found that the last 2 lines of code
y[id].x = y1;
y[id].y = y2;
consume about 0.9ms.
At first I thought this was because of the access latency of global memory, but when I modified these 2 lines to
y[id].x = 1;
y[id].y = 2;
the total runtime is only about 1ms!
Then I thought this might be due to register dependencies, as mentioned in the “CUDA Best Practices Guide” section 4.3, but when I modified the code to
float x1 = 1;
float x2 = 2;
y[id].x = x1;
y[id].y = x2;
the total run time is still about 1 ms.
Only if I copy the value of “y1” or “y2” to another register or to global memory, like
float x1 = y1;
float x2 = y2;
y[id].x = x1;
y[id].y = x2;
the total run time rises to 1.9 ms again.
So can anyone help explain this? And is there any way to optimize my code?
thank you very much!
Best regards
Zhu Wen