I need help optimizing the speed of my kernel function. At the moment I use a GTX570 with 480 single-precision CUDA cores and 60 double-precision CUDA cores. Currently my kernel uses only global memory, but in the future I want to use shared memory to optimize the kernel's speed.
My kernel function multiplies each row of the 2D input array by a 1D window array. In the future I want to store the 1D window coefficient array in shared memory, but I do not know how to do this. My window coefficient array consists of at most 4096 complex double-precision values. At the moment I use the CUDA 4.2 compiler.
/*
 * Applies a 1D window to every row of a row-major 2D complex array.
 *
 * Each thread handles one element: the global x index selects the column,
 * the global y index selects the row. For that element both the .x and .y
 * members of Input_Data are multiplied by Window_Coefficient[column] and
 * written to the same position in Result.
 *
 * Parameters:
 *   Input_Data         - source array, indexed as row * Number_Of_Columns + column
 *   Result             - destination array, same indexing as Input_Data
 *   Window_Coefficient - one real coefficient per column
 *   Number_Of_Columns  - row length (elements per row)
 *   Number_Of_Rows     - number of rows
 *
 * Launch layout: a 2D grid whose x extent covers the columns and whose y
 * extent covers the rows. Threads that fall outside the array do nothing,
 * so the grid may be rounded up past the array bounds.
 *
 * No shared memory is used and there is no inter-thread communication, so
 * the kernel contains no barrier.
 */
__global__ void Kernel_FFT_Window_Double_Complex_Precision(complex_double *Input_Data, complex_double *Result, double *Window_Coefficient, int Number_Of_Columns, int Number_Of_Rows)
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row    = blockIdx.y * blockDim.y + threadIdx.y;

    /* Guard against threads in the rounded-up part of the grid. */
    if (column >= Number_Of_Columns || row >= Number_Of_Rows)
    {
        return;
    }

    /* Flat offset computed once; widened so row * columns cannot overflow int. */
    const size_t offset = (size_t)row * (size_t)Number_Of_Columns + (size_t)column;

    /* Load the coefficient once and reuse it for both components. */
    const double coefficient = Window_Coefficient[column];

    Result[offset].x = Input_Data[offset].x * coefficient;
    Result[offset].y = Input_Data[offset].y * coefficient;
}
/*
 * Host-side, DLL-exported entry point that launches
 * Kernel_FFT_Window_Double_Complex_Precision over the whole 2D array.
 *
 * Parameters are forwarded unchanged to the kernel; this function performs
 * no allocation and no host<->device copies, so the three pointers must
 * already be dereferenceable by device code (presumably cudaMalloc'ed
 * buffers - confirm against the caller).
 *
 * Launch configuration:
 *   block = 32 x 1 threads
 *   grid  = ceil(Number_Of_Columns / 32) x Number_Of_Rows blocks
 * The x extent is rounded UP so that a trailing partial group of columns is
 * still processed when Number_Of_Columns is not a multiple of 32.
 *
 * NOTE(review): Number_Of_Rows is used directly as gridDim.y, so it must not
 * exceed the device's grid-y limit (65535 per the CUDA programming guide).
 *
 * The launch is asynchronous and this function does not synchronize or
 * check for errors; callers should do so before reading Result.
 */
extern "C" __declspec(dllexport) void GPU_FFT_Window_Double_Complex_Precision(complex_double *Input_Data, complex_double *Result, double *Window_Coefficient, int Number_Of_Columns, int Number_Of_Rows)
{
    const int Threads_Per_Block_X = 32;

    /* Nothing to do; also avoids an invalid zero/negative grid dimension. */
    if (Number_Of_Columns <= 0 || Number_Of_Rows <= 0)
    {
        return;
    }

    dim3 dimBlock;
    dimBlock.x = Threads_Per_Block_X;
    dimBlock.y = 1;

    dim3 dimGrid;
    /* Ceiling division: cover every column even when the width is not a multiple of 32. */
    dimGrid.x = (Number_Of_Columns + Threads_Per_Block_X - 1) / Threads_Per_Block_X;
    dimGrid.y = Number_Of_Rows;

    Kernel_FFT_Window_Double_Complex_Precision<<<dimGrid, dimBlock>>>(Input_Data, Result, Window_Coefficient, Number_Of_Columns, Number_Of_Rows);
}