explanantion on point-wise computation

Hello I’ve posted this in the “General CUDA GPU Computing Discussion” but got no answear,

so I’ll try here instead !

I am starting cuda programming on a quadro FX 2700M on a hp notebook,

after reading some tutorials and some cuda sdk, I’ve tested the following code (

here is just the important func). Basically all it does is to compute the point-wise conjugate of an array of complex.

The obtained results are puzzling me.

The 3rd approach obviously gives the best results (on my computer at least) but the

high price operation are : 1 load from global and 1 write to global, that is the very same

load/write than the number 1 and 2 approaches.

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

The compile information for the ptxas does not seem to better explain things to me :

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_20'

ptxas info    : Used 10 registers, 1024+0 bytes smem, 40 bytes cmem[0], 4 bytes cmem[16]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_20'

ptxas info    : Used 12 registers, 4096+0 bytes smem, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_20'

ptxas info    : Used 6 registers, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_20'

ptxas info    : Used 4 registers, 40 bytes cmem[0]

ptxas /tmp/tmpxft_00007e9f_00000000-2_testAccess.compute_10.ptx, line 99; warning : Double is not supported. Demoting to float

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_10'

ptxas info    : Used 5 registers, 1032+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_10'

ptxas info    : Used 4 registers, 4104+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

Any explanation ?

Regards.

RESULTS

=============================

= Testing complex conjugate =

=============================

= approach1 gives : 97.6781

=============================

= approach2 gives : 97.5815

=============================

= approach3 gives : 21.838

=============================

= approach4 gives : 813.004

=============================

void testConjugate(int argc, char** argv){

// set timers

double time_spent_std = 0.0;

double time_spent_special = 0.0;

double time_spent_special2 = 0.0;

double time_spent_special3 = 0.0;

unsigned int timer_std;

unsigned int timer_special;

unsigned int timer_special2;

unsigned int timer_special3;

cutilCheckError(cutCreateTimer(&timer_std));

cutilCheckError(cutCreateTimer(&timer_special));

cutilCheckError(cutCreateTimer(&timer_special2));

cutilCheckError(cutCreateTimer(&timer_special3));

cutilCheckError(cutResetTimer(timer_std));

cutilCheckError(cutResetTimer(timer_special));

cutilCheckError(cutResetTimer(timer_special2));

cutilCheckError(cutResetTimer(timer_special3));

unsigned int dim = 128;

unsigned int xp_number = 10;

unsigned int lim = 20;

cufftComplex *tmp1_var;

// alloc KERNEL CONJUGATES ON DEVICE

cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dim*dim*dim*sizeof(cufftComplex)));

for( unsigned int j = 0; j < xp_number; ++j) {

/////////////////////////////////////////////////////

// method 1

cutStartTimer(timer_std);

for (unsigned int i = 0; i < lim ; ++i) {

//compute spectral conjugate

ComplexConjugate<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_std);

time_spent_std += cutGetTimerValue(timer_std);

cutResetTimer(timer_std);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 2

cutStartTimer(timer_special);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev2<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special);

time_spent_special += cutGetTimerValue(timer_special);

cutResetTimer(timer_special);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 3

cutStartTimer(timer_special2);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special2);

time_spent_special2 += cutGetTimerValue(timer_special2);

cutResetTimer(timer_special2);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 4

cutStartTimer(timer_special3);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special3);

time_spent_special3 += cutGetTimerValue(timer_special3);

cutResetTimer(timer_special3);

cutilCheckMsg("Something went wrong");

}

std::cout << "=============================" << std::endl;

std::cout << "= Testing complex conjugate =" << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

cudaFree(tmp1_var);

}

// Complex conjugate

static __global__ void ComplexConjugate(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y = - a[threadID].y;

}

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

static __global__ void ComplexConjugatev4(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[128];

if ( threadID == 0 ){

for( unsigned int i = 0; i < 128; ++i) {

tab[i] = a[blockIdx.x * blockDim.x + i];

}

}

__syncthreads();

tab[threadID].y *= - 1.0;

__syncthreads();

if ( threadID == 0) {

for( unsigned int j = 0; j < 128; ++j) {

a[blockIdx.x * blockDim.x + j] = tab[j];

}

}

}

Hello I’ve posted this in the “General CUDA GPU Computing Discussion” but got no answear,

so I’ll try here instead !

I am starting cuda programming on a quadro FX 2700M on a hp notebook,

after reading some tutorials and some cuda sdk, I’ve tested the following code (

here is just the important func). Basically all it does is to compute the point-wise conjugate of an array of complex.

The obtained results are puzzling me.

The 3rd approach obviously gives the best results (on my computer at least) but the

high price operation are : 1 load from global and 1 write to global, that is the very same

load/write than the number 1 and 2 approaches.

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

The compile information for the ptxas does not seem to better explain things to me :

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_20'

ptxas info    : Used 10 registers, 1024+0 bytes smem, 40 bytes cmem[0], 4 bytes cmem[16]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_20'

ptxas info    : Used 12 registers, 4096+0 bytes smem, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_20'

ptxas info    : Used 6 registers, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_20'

ptxas info    : Used 4 registers, 40 bytes cmem[0]

ptxas /tmp/tmpxft_00007e9f_00000000-2_testAccess.compute_10.ptx, line 99; warning : Double is not supported. Demoting to float

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_10'

ptxas info    : Used 5 registers, 1032+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_10'

ptxas info    : Used 4 registers, 4104+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

Any explanation ?

Regards.

RESULTS

=============================

= Testing complex conjugate =

=============================

= approach1 gives : 97.6781

=============================

= approach2 gives : 97.5815

=============================

= approach3 gives : 21.838

=============================

= approach4 gives : 813.004

=============================

void testConjugate(int argc, char** argv){

// set timers

double time_spent_std = 0.0;

double time_spent_special = 0.0;

double time_spent_special2 = 0.0;

double time_spent_special3 = 0.0;

unsigned int timer_std;

unsigned int timer_special;

unsigned int timer_special2;

unsigned int timer_special3;

cutilCheckError(cutCreateTimer(&timer_std));

cutilCheckError(cutCreateTimer(&timer_special));

cutilCheckError(cutCreateTimer(&timer_special2));

cutilCheckError(cutCreateTimer(&timer_special3));

cutilCheckError(cutResetTimer(timer_std));

cutilCheckError(cutResetTimer(timer_special));

cutilCheckError(cutResetTimer(timer_special2));

cutilCheckError(cutResetTimer(timer_special3));

unsigned int dim = 128;

unsigned int xp_number = 10;

unsigned int lim = 20;

cufftComplex *tmp1_var;

// alloc KERNEL CONJUGATES ON DEVICE

cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dim*dim*dim*sizeof(cufftComplex)));

for( unsigned int j = 0; j < xp_number; ++j) {

/////////////////////////////////////////////////////

// method 1

cutStartTimer(timer_std);

for (unsigned int i = 0; i < lim ; ++i) {

//compute spectral conjugate

ComplexConjugate<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_std);

time_spent_std += cutGetTimerValue(timer_std);

cutResetTimer(timer_std);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 2

cutStartTimer(timer_special);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev2<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special);

time_spent_special += cutGetTimerValue(timer_special);

cutResetTimer(timer_special);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 3

cutStartTimer(timer_special2);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special2);

time_spent_special2 += cutGetTimerValue(timer_special2);

cutResetTimer(timer_special2);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 4

cutStartTimer(timer_special3);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special3);

time_spent_special3 += cutGetTimerValue(timer_special3);

cutResetTimer(timer_special3);

cutilCheckMsg("Something went wrong");

}

std::cout << "=============================" << std::endl;

std::cout << "= Testing complex conjugate =" << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

cudaFree(tmp1_var);

}

// Complex conjugate

static __global__ void ComplexConjugate(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y = - a[threadID].y;

}

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

static __global__ void ComplexConjugatev4(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[128];

if ( threadID == 0 ){

for( unsigned int i = 0; i < 128; ++i) {

tab[i] = a[blockIdx.x * blockDim.x + i];

}

}

__syncthreads();

tab[threadID].y *= - 1.0;

__syncthreads();

if ( threadID == 0) {

for( unsigned int j = 0; j < 128; ++j) {

a[blockIdx.x * blockDim.x + j] = tab[j];

}

}

}

Why are you loading the entire shared memory from just one thread? And since ur doing that, why are you doing syncthreads?

The reason kernel4 is faster is because you are using shared memory… Try this piece of code instead:

It should be quite a bit faster…

BTW kernel 4 was not the fastest !!

Well the fastest approach was number 3, that is the one where each thread put 1 complex in a shared array of complex, do the conjugate thing and put the computed value back to the global memory.

Since my hardware (quadro 2700M) is a 1.1 type, it seems (I remember that is written somewhere) that the “coalesced” thing does not happen properly, as a consequence there is some optimisation

that are so much dependent on the code that it cannot be guessed…

Anyway thanks for your advice.

Regards.K