I need help to optimize the speed for my kernel function

I need help to optimize the speed for my kernel function. At the moment I use a GTX570 with 480 single precision cuda cores and 60 double precision cuda cores. At the moment my Kernel use only the global memory. But in the future I want to use the shared memory for optimizing the kernel speed.

My kernel function multiplies each row of the 2D input array with 1D window array. In the future I want to save the 1D window coefficient array in the shared memory, but I do not know how to do this. My window coefficient array consist of maximal 4096 complex double precision values. At the moment I use the cuda 4.2 compiler.

global void Kernel_FFT_Window_Double_Complex_Precision(complex_double *Input_Data, complex_double *Result, double *Window_Coefficient, int Number_Of_Columns, int Number_Of_Rows)
{
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;

if( tidx >= Number_Of_Columns )
{
	return;
}

if( tidy >= Number_Of_Rows )
{
	return;
}

Result[tidy * Number_Of_Columns + tidx].x = Input_Data[tidy * Number_Of_Columns + tidx].x * Window_Coefficient[tidx];
Result[tidy * Number_Of_Columns + tidx].y = Input_Data[tidy * Number_Of_Columns + tidx].y * Window_Coefficient[tidx];

__syncthreads();

}

extern “C” __declspec(dllexport) void GPU_FFT_Window_Double_Complex_Precision(complex_double *Input_Data, complex_double *Result, double *Window_Coefficient, int Number_Of_Columns, int Number_Of_Rows)
{
dim3 dimGrid;
dim3 dimBlock;

dimBlock.x = 32;
dimBlock.y = 1;

dimGrid.x =Number_Of_Columns / 32;
dimGrid.y = Number_Of_Rows;

Kernel_FFT_Window_Double_Complex_Precision<<<dimGrid, dimBlock>>>(Input_Data, Result, Window_Coefficient, Number_Of_Columns, Number_Of_Rows); 	

}

I have modify my kernel code, but now my kernel crash :-)

int tidx = blockIdx.x * blockDim.x + threadIdx.x;

int tidy = blockIdx.y * blockDim.y + threadIdx.y;

[b]extern shared double s_data;

s_data[tidx] = Window_Coefficient[tidx];

__syncthreads();[/b]

if( tidx >= Number_Of_Columns )

{

return;

}

if( tidy >= Number_Of_Rows )

{

return;

}

Result[tidy * Number_Of_Columns + tidx].x = Input_Data[tidy * Number_Of_Columns + tidx].x * s_data[tidx];

Result[tidy * Number_Of_Columns + tidx].y = Input_Data[tidy * Number_Of_Columns + tidx].y * s_data[tidx];

__syncthreads();

}

Here is my new code, but now I get false data results

global void Kernel_FFT_Window_Double_Complex_Precision(complex_double *Input_Data, complex_double *Result, double *Window_Coefficient, int Number_Of_Columns, int Number_Of_Rows)

{

int tidx = blockIdx.x * blockDim.x + threadIdx.x;

int tidy = blockIdx.y * blockDim.y + threadIdx.y;

int test = threadIdx.x;

[b] // Copy the Data from global memory to shared memory

extern shared double s_data;

s_data[test] = Window_Coefficient[test];

__syncthreads();[/b]

if( tidx >= Number_Of_Columns )

{

return;

}

if( tidy >= Number_Of_Rows )

{

return;

}

Result[tidy * Number_Of_Columns + tidx].x = Input_Data[tidy * Number_Of_Columns + tidx].x * s_data[test];

Result[tidy * Number_Of_Columns + tidx].y = Input_Data[tidy * Number_Of_Columns + tidx].y * s_data[test];

__syncthreads();

}

int numThreadsPerBlock = 128;

int sharedMemSize = numThreadsPerBlock * sizeof(double);

dim3 dimGrid;

dim3 dimBlock;

dimBlock.x = 128;

dimBlock.y = 1;

dimGrid.x = Number_Of_Columns / 128;

dimGrid.y = Number_Of_Rows;

Kernel_FFT_Window_Double_Complex_Precision<<<dimGrid, dimBlock, sharedMemSize>>>(Input_Data, Result, Window_Coefficient, Number_Of_Columns, Number_Of_Rows);

Pls help me :-)