Hi,
I recently went back to a two-year-old app which had some kernels "optimized" using shared memory on a GF 8800 GTX. At the time the kernels were written, we got a 13x speedup from naive to shared. With the GT200 (Tesla C1060) we still got a 3.3x speedup, but I just realized today that on a Fermi board (Tesla C2060), thanks to the cache, the naive kernel is actually faster than the "optimized" one!
Has anybody had similar results on other apps? Are we heading toward the end of shared memory?
And finally, do we have to write more than one kernel and dynamically switch at runtime based on GPU detection? (A sketch of what I mean is just below.)
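To make that question concrete, here is a minimal sketch of the kind of dispatch I have in mind, using cudaGetDeviceProperties to read the compute capability. The helper name, the use of device 0, and the "major >= 2 means Fermi" threshold are just my assumptions for illustration; the kernels are the ones from the test code below.

// Sketch: pick a kernel at runtime from the device's compute capability.
// Assumes device 0 and the k_naive/k_shared kernels from the test code below.
void launch_force(float *pot, float *fx, dim3 grid, dim3 block)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    if (prop.major >= 2) {
        // Fermi and later have an L1/L2 cache: the naive version is faster here
        k_naive<<<grid, block>>>(pot, fx);
    } else {
        // G80/GT200: no cache for global loads, shared memory wins
        k_shared<<<grid, block>>>(pot, fx);
    }
}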
Here are the results I get:
GeForce 8800 GTX:
Time naive 5.60ms
Time shared 0.41ms
Speedup is 13.65
C1060:
Time naive 1.38ms
Time shared 0.42ms
Speedup is 3.32
C2060:
Time naive 0.12ms
Time shared 0.20ms
Speedup is 0.61
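(Side note: on Fermi it may also be worth playing with the L1/shared split. Something like the line below, placed before the naive launches, asks for the 48 KB L1 configuration. I have not measured its effect here, so take it as a suggestion, not part of the numbers above.)

// Fermi only: prefer 48 KB L1 / 16 KB shared for this kernel (a hint; untested here)
cudaFuncSetCacheConfig(k_naive, cudaFuncCachePreferL1);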
And here is the sample test code that produced the results:
#include <stdio.h>

#define NP 128                 /* cells per row (power of two, so & (NP-1) wraps) */
#define LBOX 6.f               /* box length */
#define DX ((float)LBOX/NP)    /* cell size */
#define ITERATIONS 10
//========================================================================
__global__ void k_shared( float *pot, float *fx )
{
    int tx = threadIdx.x;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    __shared__ float spot[NP];
    int blockCoord = bx * NP * NP + by * NP;   // first cell of this row
    int cellCoord  = blockCoord + tx;

    // Stage the row in shared memory, then take a centered difference
    // with periodic wrap-around within the row.
    spot[tx] = pot[cellCoord];
    __syncthreads();
    float x1 = spot[ ((unsigned int)(tx - 1)) & (NP - 1) ];
    float x2 = spot[ (tx + 1) & (NP - 1) ];
    fx[cellCoord] = (x2 - x1) / (2.f * DX);
}
__global__ void k_naive( float *pot, float *fx )
{
    int tx = threadIdx.x;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int blockCoord = bx * NP * NP + by * NP;
    int cellCoord  = blockCoord + tx;

    // Same centered difference, read straight from global memory.
    // Note the wrap must stay within the row (blockCoord + masked tx);
    // masking cellCoord itself would make every block read row 0.
    float x1 = pot[ blockCoord + (((unsigned int)(tx - 1)) & (NP - 1)) ];
    float x2 = pot[ blockCoord + ((tx + 1) & (NP - 1)) ];
    fx[cellCoord] = (x2 - x1) / (2.f * DX);
}
int main(int argc, char **argv) {
    float *pot, *fx;
    float elapsedTime_naive  = -1;
    float elapsedTime_shared = -1;
    cudaEvent_t start_event, stop_event;
    dim3 dimGridForce(NP, NP);   // one block per row
    dim3 dimBlockForce(NP);      // one thread per cell in the row

    /* Alloc (and zero pot, so the kernels read defined data) */
    cudaMalloc((void**)&pot, sizeof(float) * NP * NP * NP);
    cudaMalloc((void**)&fx,  sizeof(float) * NP * NP * NP);
    cudaMemset(pot, 0, sizeof(float) * NP * NP * NP);

    /* Prepare events for timing */
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);

    /*********************
     * NAIVE
     */
    // Warm-up
    k_naive<<<dimGridForce,dimBlockForce>>>(pot, fx);
    // Bench
    cudaEventRecord(start_event, 0);
    for (int i = 0; i < ITERATIONS; i++) {
        k_naive<<<dimGridForce,dimBlockForce>>>(pot, fx);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsedTime_naive, start_event, stop_event);

    /*********************
     * SHARED
     */
    // Warm-up
    k_shared<<<dimGridForce,dimBlockForce>>>(pot, fx);
    // Bench
    cudaEventRecord(start_event, 0);
    for (int i = 0; i < ITERATIONS; i++) {
        k_shared<<<dimGridForce,dimBlockForce>>>(pot, fx);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsedTime_shared, start_event, stop_event);

    printf("Time naive %0.2fms\nTime shared %0.2fms\nSpeedup is %0.2f\n",
           elapsedTime_naive / ITERATIONS, elapsedTime_shared / ITERATIONS,
           elapsedTime_naive / elapsedTime_shared);

    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFree(pot);
    cudaFree(fx);
    return 0;
}
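For anyone wanting to try it: a build line along these lines should do (the file name is arbitrary; pick -arch to match the board):

nvcc -O3 -arch=sm_13 bench.cu -o bench    # GT200 (C1060)
nvcc -O3 -arch=sm_20 bench.cu -o bench    # Fermi (C2060)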