Hi,

I took again a 2 years old app wich had some kernel “optimized” using shared memory on a GF 8800 GTX. At the time the kernels were written, we got 13x speedup from naive to shared. With the GT200 (Tesla C1060) we still got 3.3x speedup, but I just realized today that on a Fermi board (Tesla C2060), thanks to the cache, the naive kernel is really faster than the “optimized” one !

Did anybody had similar results on some apps ? Are we going toward the end of shared memory ?

And finally do we have to write more than one kernel and dynamically switch at execution with a GPU detection ?

Here are the results I get :

GeForce 8800 GTX :

Time naive 5.60ms

Time shared 0.41ms

Speedup is 13.65

C1060 :

Time naive 1.38ms

Time shared 0.42ms

Speedup is 3.32

C2060 :

Time naive 0.12ms

Time shared 0.20ms

Speedup is 0.61

And the sample test code that produced the results :

```
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <sys/time.h>
#define NP 128
#define LBOX 6.f
#define DX ((float)LBOX/NP)
#define ITERATIONS 10
//========================================================================
__global__ void k_shared( float *pot, float *fx )
{
int tx=threadIdx.x;
int bx=blockIdx.x;
int by=blockIdx.y;
__shared__ float spot[NP];
int blockCoord = bx * NP * NP + by*NP;
int cellCoord = blockCoord + tx;
spot[tx]=pot[cellCoord];
__syncthreads();
float x1 = spot[ ((unsigned int)(tx - 1 )) & (NP-1) ];
float x2 = spot[ (tx + 1 ) & (NP-1) ];
fx[cellCoord]=(x2-x1)/(2.f*DX);
}
__global__ void k_naive( float *pot, float *fx )
{
int tx=threadIdx.x;
int bx=blockIdx.x;
int by=blockIdx.y;
int cellCoord = bx * NP * NP + by*NP + tx;
fx[cellCoord]=(pot[(cellCoord+1) & (NP - 1)]-pot[(cellCoord-1) & (NP - 1)])/(2.f*DX);
}
int main(int argc, char **argv) {
float *pot,*fx;
float elapsedTime_naive = -1;
float elapsedTime_shared = -1;
cudaEvent_t start_event, stop_event;
dim3 dimGridForce(NP,NP);
dim3 dimBlockForce(NP);
/* Alloc */
cudaMalloc((void**)&pot,sizeof(float)*NP*NP*NP);
cudaMalloc((void**)&fx,sizeof(float)*NP*NP*NP);
/* Prepare event for timing */
cudaEventCreate(&start_event);
cudaEventCreate(&stop_event);
/*********************
* NAIVE
*/
// Warm-up
k_naive<<<dimGridForce,dimBlockForce>>>(pot,fx);
// Bench
cudaEventRecord(start_event, 0);
for(int i=0; i<ITERATIONS; i++) {
k_naive<<<dimGridForce,dimBlockForce>>>(pot,fx);
}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsedTime_naive, start_event, stop_event);
/*********************
* SHARED
*/
// Warm-up
k_shared<<<dimGridForce,dimBlockForce>>>(pot,fx);
// Bench
cudaEventRecord(start_event, 0);
for(int i=0; i<ITERATIONS; i++) {
k_shared<<<dimGridForce,dimBlockForce>>>(pot,fx);
}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsedTime_shared, start_event, stop_event);
printf("Time naive %0.2fms\nTime shared %0.2fms\nSpeedup is %0.2f\n",
elapsedTime_naive/ITERATIONS,elapsedTime_shared/ITERATIONS,elapsedTime_naive/elapsedTime_shared);
}
```