explanantion on point-wise computation

Mitsu_DunDee · November 25, 2010, 9:32am

Hello I’ve posted this in the “General CUDA GPU Computing Discussion” but got no answear,

so I’ll try here instead !

I am starting cuda programming on a quadro FX 2700M on a hp notebook,

after reading some tutorials and some cuda sdk, I’ve tested the following code (

here is just the important func). Basically all it does is to compute the point-wise conjugate of an array of complex.

The obtained results are puzzling me.

The 3rd approach obviously gives the best results (on my computer at least) but the

high price operation are : 1 load from global and 1 write to global, that is the very same

load/write than the number 1 and 2 approaches.

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

The compile information for the ptxas does not seem to better explain things to me :

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_20'

ptxas info    : Used 10 registers, 1024+0 bytes smem, 40 bytes cmem[0], 4 bytes cmem[16]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_20'

ptxas info    : Used 12 registers, 4096+0 bytes smem, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_20'

ptxas info    : Used 6 registers, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_20'

ptxas info    : Used 4 registers, 40 bytes cmem[0]

ptxas /tmp/tmpxft_00007e9f_00000000-2_testAccess.compute_10.ptx, line 99; warning : Double is not supported. Demoting to float

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_10'

ptxas info    : Used 5 registers, 1032+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_10'

ptxas info    : Used 4 registers, 4104+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

Any explanation ?

Regards.

RESULTS

=============================

= Testing complex conjugate =

=============================

= approach1 gives : 97.6781

=============================

= approach2 gives : 97.5815

=============================

= approach3 gives : 21.838

=============================

= approach4 gives : 813.004

=============================

void testConjugate(int argc, char** argv){

// set timers

double time_spent_std = 0.0;

double time_spent_special = 0.0;

double time_spent_special2 = 0.0;

double time_spent_special3 = 0.0;

unsigned int timer_std;

unsigned int timer_special;

unsigned int timer_special2;

unsigned int timer_special3;

cutilCheckError(cutCreateTimer(&timer_std));

cutilCheckError(cutCreateTimer(&timer_special));

cutilCheckError(cutCreateTimer(&timer_special2));

cutilCheckError(cutCreateTimer(&timer_special3));

cutilCheckError(cutResetTimer(timer_std));

cutilCheckError(cutResetTimer(timer_special));

cutilCheckError(cutResetTimer(timer_special2));

cutilCheckError(cutResetTimer(timer_special3));

unsigned int dim = 128;

unsigned int xp_number = 10;

unsigned int lim = 20;

cufftComplex *tmp1_var;

// alloc KERNEL CONJUGATES ON DEVICE

cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dim*dim*dim*sizeof(cufftComplex)));

for( unsigned int j = 0; j < xp_number; ++j) {

/////////////////////////////////////////////////////

// method 1

cutStartTimer(timer_std);

for (unsigned int i = 0; i < lim ; ++i) {

//compute spectral conjugate

ComplexConjugate<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_std);

time_spent_std += cutGetTimerValue(timer_std);

cutResetTimer(timer_std);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 2

cutStartTimer(timer_special);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev2<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special);

time_spent_special += cutGetTimerValue(timer_special);

cutResetTimer(timer_special);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 3

cutStartTimer(timer_special2);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special2);

time_spent_special2 += cutGetTimerValue(timer_special2);

cutResetTimer(timer_special2);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 4

cutStartTimer(timer_special3);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special3);

time_spent_special3 += cutGetTimerValue(timer_special3);

cutResetTimer(timer_special3);

cutilCheckMsg("Something went wrong");

}

std::cout << "=============================" << std::endl;

std::cout << "= Testing complex conjugate =" << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

cudaFree(tmp1_var);

}

// Complex conjugate

static __global__ void ComplexConjugate(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y = - a[threadID].y;

}

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

static __global__ void ComplexConjugatev4(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[128];

if ( threadID == 0 ){

for( unsigned int i = 0; i < 128; ++i) {

tab[i] = a[blockIdx.x * blockDim.x + i];

}

}

__syncthreads();

tab[threadID].y *= - 1.0;

__syncthreads();

if ( threadID == 0) {

for( unsigned int j = 0; j < 128; ++j) {

a[blockIdx.x * blockDim.x + j] = tab[j];

}

}

}

Mitsu_DunDee · November 25, 2010, 9:32am

Hello I’ve posted this in the “General CUDA GPU Computing Discussion” but got no answear,

so I’ll try here instead !

I am starting cuda programming on a quadro FX 2700M on a hp notebook,

after reading some tutorials and some cuda sdk, I’ve tested the following code (

here is just the important func). Basically all it does is to compute the point-wise conjugate of an array of complex.

The obtained results are puzzling me.

The 3rd approach obviously gives the best results (on my computer at least) but the

high price operation are : 1 load from global and 1 write to global, that is the very same

load/write than the number 1 and 2 approaches.

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

The compile information for the ptxas does not seem to better explain things to me :

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_20'

ptxas info    : Used 10 registers, 1024+0 bytes smem, 40 bytes cmem[0], 4 bytes cmem[16]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_20'

ptxas info    : Used 12 registers, 4096+0 bytes smem, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_20'

ptxas info    : Used 6 registers, 40 bytes cmem[0]

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_20'

ptxas info    : Used 4 registers, 40 bytes cmem[0]

ptxas /tmp/tmpxft_00007e9f_00000000-2_testAccess.compute_10.ptx, line 99; warning : Double is not supported. Demoting to float

ptxas info    : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_10'

ptxas info    : Used 5 registers, 1032+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_10'

ptxas info    : Used 4 registers, 4104+16 bytes smem

ptxas info    : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

ptxas info    : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_10'

ptxas info    : Used 2 registers, 8+16 bytes smem

Any explanation ?

Regards.

RESULTS

=============================

= Testing complex conjugate =

=============================

= approach1 gives : 97.6781

=============================

= approach2 gives : 97.5815

=============================

= approach3 gives : 21.838

=============================

= approach4 gives : 813.004

=============================

void testConjugate(int argc, char** argv){

// set timers

double time_spent_std = 0.0;

double time_spent_special = 0.0;

double time_spent_special2 = 0.0;

double time_spent_special3 = 0.0;

unsigned int timer_std;

unsigned int timer_special;

unsigned int timer_special2;

unsigned int timer_special3;

cutilCheckError(cutCreateTimer(&timer_std));

cutilCheckError(cutCreateTimer(&timer_special));

cutilCheckError(cutCreateTimer(&timer_special2));

cutilCheckError(cutCreateTimer(&timer_special3));

cutilCheckError(cutResetTimer(timer_std));

cutilCheckError(cutResetTimer(timer_special));

cutilCheckError(cutResetTimer(timer_special2));

cutilCheckError(cutResetTimer(timer_special3));

unsigned int dim = 128;

unsigned int xp_number = 10;

unsigned int lim = 20;

cufftComplex *tmp1_var;

// alloc KERNEL CONJUGATES ON DEVICE

cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dim*dim*dim*sizeof(cufftComplex)));

for( unsigned int j = 0; j < xp_number; ++j) {

/////////////////////////////////////////////////////

// method 1

cutStartTimer(timer_std);

for (unsigned int i = 0; i < lim ; ++i) {

//compute spectral conjugate

ComplexConjugate<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_std);

time_spent_std += cutGetTimerValue(timer_std);

cutResetTimer(timer_std);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 2

cutStartTimer(timer_special);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev2<<<4096,512>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special);

time_spent_special += cutGetTimerValue(timer_special);

cutResetTimer(timer_special);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 3

cutStartTimer(timer_special2);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special2);

time_spent_special2 += cutGetTimerValue(timer_special2);

cutResetTimer(timer_special2);

cutilCheckMsg("Something went wrong");

/////////////////////////////////////////////////////

// method 4

cutStartTimer(timer_special3);

for (unsigned int i = 0; i < lim; ++i) {

//compute spectral conjugate

ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);

}

cudaThreadSynchronize();

cutStopTimer(timer_special3);

time_spent_special3 += cutGetTimerValue(timer_special3);

cutResetTimer(timer_special3);

cutilCheckMsg("Something went wrong");

}

std::cout << "=============================" << std::endl;

std::cout << "= Testing complex conjugate =" << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;

std::cout << "=============================" << std::endl;

cudaFree(tmp1_var);

}

// Complex conjugate

static __global__ void ComplexConjugate(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y = - a[threadID].y;

}

// Complex conjugate

static __global__ void ComplexConjugatev2(Complex* a)

{

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

a[threadID].y *= -1.0;

}

static __global__ void ComplexConjugatev3(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[512];

tab[threadID] = a[blockIdx.x * blockDim.x + threadID];

tab[threadID].y *= - 1.0;

a[blockIdx.x * blockDim.x + threadID] = tab[threadID];

}

static __global__ void ComplexConjugatev4(Complex* a)

{

const int threadID = threadIdx.x;

__shared__ cufftComplex tab[128];

if ( threadID == 0 ){

for( unsigned int i = 0; i < 128; ++i) {

tab[i] = a[blockIdx.x * blockDim.x + i];

}

}

__syncthreads();

tab[threadID].y *= - 1.0;

__syncthreads();

if ( threadID == 0) {

for( unsigned int j = 0; j < 128; ++j) {

a[blockIdx.x * blockDim.x + j] = tab[j];

}

}

}

Debdatta · November 29, 2010, 7:33am

Why are you loading the entire shared memory from just one thread? And since ur doing that, why are you doing syncthreads?

The reason kernel4 is faster is because you are using shared memory… Try this piece of code instead:

It should be quite a bit faster…

Mitsu_DunDee · December 2, 2010, 1:46pm

BTW kernel 4 was not the fastest !!

Well the fastest approach was number 3, that is the one where each thread put 1 complex in a shared array of complex, do the conjugate thing and put the computed value back to the global memory.

Since my hardware (quadro 2700M) is a 1.1 type, it seems (I remember that is written somewhere) that the “coalesced” thing does not happen properly, as a consequence there is some optimisation

that are so much dependent on the code that it cannot be guessed…

Anyway thanks for your advice.

Regards.K

Topic		Replies	Views
small bench accessing global mem vs shared CUDA Programming and Performance	1	5982	November 24, 2010
matrix multiply reduction CUDA Programming and Performance	41	35554	January 15, 2011
Memory problem? ...incredible slowdown CUDA Programming and Performance	29	16304	January 30, 2011
Can you GUESS this without experimenting? Latencies CUDA Programming and Performance	13	9349	January 7, 2008
Why my ldmatrix PTX instruction is wrong? CUDA Programming and Performance cuda	10	759	March 30, 2024
Cuda code performance CUDA Programming and Performance	14	3155	December 16, 2014
cuda memory issues Legacy PGI Compilers	13	9511	March 19, 2012
GPU/CPU precision comparison and Kernel instructions question CUDA Programming and Performance	5	680	April 4, 2017
Why cuda kernel use unexpected stack frame? CUDA Programming and Performance cuda , kernel	8	330	April 3, 2024
why result varied based on different number of threads per block? CUDA Programming and Performance	8	1943	March 1, 2011

explanantion on point-wise computation

Related topics