I am trying to write a kernel function and measure its performance.
This code works, but I want to optimize it for higher speed.
I'm using a Titan X.
Is there a better way to implement the kernel function?
// Signal length constant. Parenthesized so the macro behaves as a single value
// in any expression (e.g. `x / SIGNAL_SIZE` or `x % SIGNAL_SIZE`).
// NOTE(review): presumably the total number of samples in the signal - confirm.
#define SIGNAL_SIZE (1024*1024)
// Block edge length used for the launch geometry.
#define TILE_WIDTH 512
/*
 * In-place point-wise product: a[i] = a[i] * b[i] for every i in
 * [0, SIGNAL_SIZE), scaling both the real (.x) and imaginary (.y) parts of
 * the complex sample by the integer factor b[i].
 *
 * Launch layout: any 1D or 2D grid of 1D or 2D blocks. The launch shape is
 * flattened into one linear thread id and the elements are walked with a
 * grid-sized stride, so the result does not depend on the grid/block size.
 * Uses no shared memory.
 *
 * a : device pointer to the complex samples, updated in place.
 * b : device pointer to the integer scale factors, read only.
 * NOTE(review): assumes a and b each hold at least SIGNAL_SIZE elements -
 * confirm against the allocation code.
 */
__global__ void point_wise_product(cufftComplex *a, int *b){
    // Linear id of this thread within the whole launch. threadIdx.x is the
    // fastest-varying component, so neighbouring threads of a warp touch
    // neighbouring elements.
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y;
    const size_t blockId = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
    const size_t first = blockId * threadsPerBlock
                       + (size_t)threadIdx.y * blockDim.x + threadIdx.x;
    // Total number of threads in the launch = distance between two elements
    // handled by the same thread.
    const size_t stride = (size_t)gridDim.x * gridDim.y * threadsPerBlock;
    const size_t count = (size_t)(SIGNAL_SIZE);

    for (size_t i = first; i < count; i += stride) {
        // Read each operand once, update in registers, write back once.
        cufftComplex value = a[i];
        const float scale = (float)b[i];
        value.x *= scale;
        value.y *= scale;
        a[i] = value;
    }
}
int main()
{
...........
// Launch geometry for the point-wise product: one thread per signal element,
// arranged as a 1D grid of 1D blocks. A TILE_WIDTH x TILE_WIDTH block would
// ask for 512*512 threads, far above the 1024-threads-per-block limit, and
// the launch would be rejected with an invalid-configuration error.
// The block count is rounded up so a partial last block is still covered.
dim3 dimGrids(((SIGNAL_SIZE) - 1)/(TILE_WIDTH) + 1, 1, 1);
dim3 dimBlocks(TILE_WIDTH, 1, 1);
}