Good Morning,
I am a novice when it comes to CUDA.
I have a little question about CUDA optimization using shared memory.
I wrote two kernels, the first using global memory and the second using shared memory, but the first one is significantly faster than the second.
Does this happen because I transfer the data between global and shared memory without first loading the data into registers?
The code for the first and second kernels follows below.
Can somebody help me?
/* First kernel */
/*
 * Element-wise product over a dimx x dimy grid: ez[idx] = ga[idx] * dz[idx].
 *
 * Expected launch: a 2D grid/block layout whose total extent covers
 * (dimx, dimy); threads falling outside the array are masked off.
 * No shared memory required.
 */
__global__ void calc_ez(float *ez, float *ga, float *dz, int dimx, int dimy) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: the grid rarely divides the data exactly, so out-of-range
    // threads must not touch memory.
    if (ix < dimx && iy < dimy) {
        int idx = iy * dimx + ix;
        ez[idx] = ga[idx] * dz[idx];
    }
    // No __syncthreads() needed: threads never exchange data here, and the
    // original barrier after the store only cost cycles.
}
/* Second kernel */
/* SUB_LARGURA is the tile width of the shared-memory arrays used in the kernel, and TAM is the full row width of the arrays in global memory. */
/*
 * Same element-wise product as calc_ez, but staged through shared memory.
 *
 * NOTE(review): shared memory CANNOT speed this kernel up. Each input
 * element is read exactly once and used by exactly one thread, so staging
 * ga/dz into shared tiles adds a global->shared->register round trip (plus
 * barrier overhead) with zero data reuse to pay for it. That extra traffic —
 * not a missing register copy — is why this version is slower than calc_ez.
 * Shared memory only helps when multiple threads reuse the same loaded data
 * (e.g. matrix-multiply tiles) or when it fixes an uncoalesced access pattern.
 *
 * Expected launch: blockDim == (SUB_LARGURA, SUB_LARGURA); gridDim.y chosen
 * so that Row stays below the row count. Requires TAM to be a multiple of
 * SUB_LARGURA (the loop bound truncates otherwise).
 * dimx/dimy are accepted for signature parity with calc_ez but are unused.
 */
__global__ void calc_ez_share(float *ez, float *ga, float *dz, int dimx, int dimy) {
    __shared__ float ga_s[SUB_LARGURA][SUB_LARGURA];
    __shared__ float dz_s[SUB_LARGURA][SUB_LARGURA];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int Row = blockIdx.y * SUB_LARGURA + ty;

    // Walk the row one SUB_LARGURA-wide tile at a time.
    for (int m = 0; m < TAM / SUB_LARGURA; m++) {
        int col = m * SUB_LARGURA + tx;     // global column of this tile element

        // Stage the operands in shared memory (demonstration only — see note).
        ga_s[ty][tx] = ga[Row * TAM + col];
        dz_s[ty][tx] = dz[Row * TAM + col];

        // No __syncthreads() is required here: every thread reads back only
        // the shared element it wrote itself, so there is no cross-thread
        // dependency. The three barriers in the original version were pure
        // overhead (the final one doubly so, sitting after the loop).
        ez[Row * TAM + col] = ga_s[ty][tx] * dz_s[ty][tx];
    }
}
Best Regards
Antonio Carlos