small bench accessing global mem vs shared

Hi, I am starting cuda programming on a quadro FX 2700M on a hp notebook,
after reading some tutorials and some cuda sdk, i’ve tested the following code (
here is just the important func). The obtained results are puzzling me.
The 3rd approach obviously gives the best results (on my computer at least) but the
high price operation are : 1 load from global and 1 write to global, that is the very same
load/write than the number 1 and 2 approaches.

Any explanation ?
Regards.

RESULTS

= Testing complex conjugate =

= approach1 gives : 97.6781

= approach2 gives : 97.5815

= approach3 gives : 21.838

= approach4 gives : 813.004

CODE
void testConjugate(int argc, char** argv){
// set timers
double time_spent_std = 0.0;
double time_spent_special = 0.0;
double time_spent_special2 = 0.0;
double time_spent_special3 = 0.0;
unsigned int timer_std;
unsigned int timer_special;
unsigned int timer_special2;
unsigned int timer_special3;
cutilCheckError(cutCreateTimer(&timer_std));
cutilCheckError(cutCreateTimer(&timer_special));
cutilCheckError(cutCreateTimer(&timer_special2));
cutilCheckError(cutCreateTimer(&timer_special3));
cutilCheckError(cutResetTimer(timer_std));
cutilCheckError(cutResetTimer(timer_special));
cutilCheckError(cutResetTimer(timer_special2));
cutilCheckError(cutResetTimer(timer_special3));

unsigned int dim = 128;
unsigned int xp_number = 10;
unsigned int lim = 20;

cufftComplex *tmp1_var;

// alloc KERNEL CONJUGATES ON DEVICE
cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dimdimdim*sizeof(cufftComplex)));

for( unsigned int j = 0; j < xp_number; ++j) {

/////////////////////////////////////////////////////
//  method 1
cutStartTimer(timer_std);  
for (unsigned int i = 0; i < lim ; ++i) {
  //compute spectral conjugate
  ComplexConjugate<<<4096,512>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_std);
time_spent_std += cutGetTimerValue(timer_std);
cutResetTimer(timer_std);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
//  method 2   
cutStartTimer(timer_special);  
for (unsigned int i = 0; i < lim; ++i) {
  //compute spectral conjugate
  ComplexConjugatev2<<<4096,512>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_special);
time_spent_special += cutGetTimerValue(timer_special);
cutResetTimer(timer_special);

cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
//  method 3
cutStartTimer(timer_special2);  
for (unsigned int i = 0; i < lim; ++i) {
  //compute spectral conjugate
  ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_special2);
time_spent_special2 += cutGetTimerValue(timer_special2);
cutResetTimer(timer_special2);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
//  method 4
cutStartTimer(timer_special3);  
for (unsigned int i = 0; i < lim; ++i) {
  //compute spectral conjugate
  ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_special3);
time_spent_special3 += cutGetTimerValue(timer_special3);
cutResetTimer(timer_special3);
cutilCheckMsg("Something went wrong");

}

std::cout << “=============================” << std::endl;
std::cout << “= Testing complex conjugate =” << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;

cudaFree(tmp1_var);

}

// Complex conjugate
static global void ComplexConjugate(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y = - a[threadID].y;
}

// Complex conjugate
static global void ComplexConjugatev2(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y *= -1.0;
}

static global void ComplexConjugatev3(Complex* a)
{
const int threadID = threadIdx.x;
shared cufftComplex tab[512];
tab[threadID] = a[blockIdx.x * blockDim.x + threadID];
tab[threadID].y *= - 1.0;
a[blockIdx.x * blockDim.x + threadID] = tab[threadID];
}

static global void ComplexConjugatev4(Complex* a)
{
const int threadID = threadIdx.x;
shared cufftComplex tab[128];
if ( threadID == 0 ){
for( unsigned int i = 0; i < 128; ++i) {
tab[i] = a[blockIdx.x * blockDim.x + i];
}
}
__syncthreads();

tab[threadID].y *= - 1.0;
__syncthreads();

if ( threadID == 0) {
for( unsigned int j = 0; j < 128; ++j) {
a[blockIdx.x * blockDim.x + j] = tab[j];
}
}
}

Hi, I am starting cuda programming on a quadro FX 2700M on a hp notebook,
after reading some tutorials and some cuda sdk, i’ve tested the following code (
here is just the important func). The obtained results are puzzling me.
The 3rd approach obviously gives the best results (on my computer at least) but the
high price operation are : 1 load from global and 1 write to global, that is the very same
load/write than the number 1 and 2 approaches.

Any explanation ?
Regards.

RESULTS

= Testing complex conjugate =

= approach1 gives : 97.6781

= approach2 gives : 97.5815

= approach3 gives : 21.838

= approach4 gives : 813.004

CODE
void testConjugate(int argc, char** argv){
// set timers
double time_spent_std = 0.0;
double time_spent_special = 0.0;
double time_spent_special2 = 0.0;
double time_spent_special3 = 0.0;
unsigned int timer_std;
unsigned int timer_special;
unsigned int timer_special2;
unsigned int timer_special3;
cutilCheckError(cutCreateTimer(&timer_std));
cutilCheckError(cutCreateTimer(&timer_special));
cutilCheckError(cutCreateTimer(&timer_special2));
cutilCheckError(cutCreateTimer(&timer_special3));
cutilCheckError(cutResetTimer(timer_std));
cutilCheckError(cutResetTimer(timer_special));
cutilCheckError(cutResetTimer(timer_special2));
cutilCheckError(cutResetTimer(timer_special3));

unsigned int dim = 128;
unsigned int xp_number = 10;
unsigned int lim = 20;

cufftComplex *tmp1_var;

// alloc KERNEL CONJUGATES ON DEVICE
cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dimdimdim*sizeof(cufftComplex)));

for( unsigned int j = 0; j < xp_number; ++j) {

/////////////////////////////////////////////////////
//  method 1
cutStartTimer(timer_std);  
for (unsigned int i = 0; i < lim ; ++i) {
  //compute spectral conjugate
  ComplexConjugate<<<4096,512>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_std);
time_spent_std += cutGetTimerValue(timer_std);
cutResetTimer(timer_std);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
//  method 2   
cutStartTimer(timer_special);  
for (unsigned int i = 0; i < lim; ++i) {
  //compute spectral conjugate
  ComplexConjugatev2<<<4096,512>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_special);
time_spent_special += cutGetTimerValue(timer_special);
cutResetTimer(timer_special);

cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
//  method 3
cutStartTimer(timer_special2);  
for (unsigned int i = 0; i < lim; ++i) {
  //compute spectral conjugate
  ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_special2);
time_spent_special2 += cutGetTimerValue(timer_special2);
cutResetTimer(timer_special2);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
//  method 4
cutStartTimer(timer_special3);  
for (unsigned int i = 0; i < lim; ++i) {
  //compute spectral conjugate
  ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);
}

cudaThreadSynchronize();
cutStopTimer(timer_special3);
time_spent_special3 += cutGetTimerValue(timer_special3);
cutResetTimer(timer_special3);
cutilCheckMsg("Something went wrong");

}

std::cout << “=============================” << std::endl;
std::cout << “= Testing complex conjugate =” << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;
std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;
std::cout << “=============================” << std::endl;

cudaFree(tmp1_var);

}

// Complex conjugate
static global void ComplexConjugate(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y = - a[threadID].y;
}

// Complex conjugate
static global void ComplexConjugatev2(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y *= -1.0;
}

static global void ComplexConjugatev3(Complex* a)
{
const int threadID = threadIdx.x;
shared cufftComplex tab[512];
tab[threadID] = a[blockIdx.x * blockDim.x + threadID];
tab[threadID].y *= - 1.0;
a[blockIdx.x * blockDim.x + threadID] = tab[threadID];
}

static global void ComplexConjugatev4(Complex* a)
{
const int threadID = threadIdx.x;
shared cufftComplex tab[128];
if ( threadID == 0 ){
for( unsigned int i = 0; i < 128; ++i) {
tab[i] = a[blockIdx.x * blockDim.x + i];
}
}
__syncthreads();

tab[threadID].y *= - 1.0;
__syncthreads();

if ( threadID == 0) {
for( unsigned int j = 0; j < 128; ++j) {
a[blockIdx.x * blockDim.x + j] = tab[j];
}
}
}