Hello I’ve posted this in the “General CUDA GPU Computing Discussion” but got no answear,
so I’ll try here instead !
I am starting cuda programming on a quadro FX 2700M on a hp notebook,
after reading some tutorials and some cuda sdk, I’ve tested the following code (
here is just the important func). Basically all it does is to compute the point-wise conjugate of an array of complex.
The obtained results are puzzling me.
The 3rd approach obviously gives the best results (on my computer at least) but the
high price operation are : 1 load from global and 1 write to global, that is the very same
load/write than the number 1 and 2 approaches.
// Complex conjugate
static __global__ void ComplexConjugatev2(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y *= -1.0;
}
static __global__ void ComplexConjugatev3(Complex* a)
{
const int threadID = threadIdx.x;
__shared__ cufftComplex tab[512];
tab[threadID] = a[blockIdx.x * blockDim.x + threadID];
tab[threadID].y *= - 1.0;
a[blockIdx.x * blockDim.x + threadID] = tab[threadID];
}
The compile information for the ptxas does not seem to better explain things to me :
ptxas info : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_20'
ptxas info : Used 10 registers, 1024+0 bytes smem, 40 bytes cmem[0], 4 bytes cmem[16]
ptxas info : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_20'
ptxas info : Used 12 registers, 4096+0 bytes smem, 40 bytes cmem[0]
ptxas info : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_20'
ptxas info : Used 6 registers, 40 bytes cmem[0]
ptxas info : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_20'
ptxas info : Used 4 registers, 40 bytes cmem[0]
ptxas /tmp/tmpxft_00007e9f_00000000-2_testAccess.compute_10.ptx, line 99; warning : Double is not supported. Demoting to float
ptxas info : Compiling entry function '_Z18ComplexConjugatev4P6float2' for 'sm_10'
ptxas info : Used 5 registers, 1032+16 bytes smem
ptxas info : Compiling entry function '_Z18ComplexConjugatev3P6float2' for 'sm_10'
ptxas info : Used 4 registers, 4104+16 bytes smem
ptxas info : Compiling entry function '_Z18ComplexConjugatev2P6float2' for 'sm_10'
ptxas info : Used 2 registers, 8+16 bytes smem
ptxas info : Compiling entry function '_Z16ComplexConjugateP6float2' for 'sm_10'
ptxas info : Used 2 registers, 8+16 bytes smem
Any explanation ?
Regards.
RESULTS
=============================
= Testing complex conjugate =
=============================
= approach1 gives : 97.6781
=============================
= approach2 gives : 97.5815
=============================
= approach3 gives : 21.838
=============================
= approach4 gives : 813.004
=============================
void testConjugate(int argc, char** argv){
// set timers
double time_spent_std = 0.0;
double time_spent_special = 0.0;
double time_spent_special2 = 0.0;
double time_spent_special3 = 0.0;
unsigned int timer_std;
unsigned int timer_special;
unsigned int timer_special2;
unsigned int timer_special3;
cutilCheckError(cutCreateTimer(&timer_std));
cutilCheckError(cutCreateTimer(&timer_special));
cutilCheckError(cutCreateTimer(&timer_special2));
cutilCheckError(cutCreateTimer(&timer_special3));
cutilCheckError(cutResetTimer(timer_std));
cutilCheckError(cutResetTimer(timer_special));
cutilCheckError(cutResetTimer(timer_special2));
cutilCheckError(cutResetTimer(timer_special3));
unsigned int dim = 128;
unsigned int xp_number = 10;
unsigned int lim = 20;
cufftComplex *tmp1_var;
// alloc KERNEL CONJUGATES ON DEVICE
cutilSafeCall(cudaMalloc( (void **) &tmp1_var, dim*dim*dim*sizeof(cufftComplex)));
for( unsigned int j = 0; j < xp_number; ++j) {
/////////////////////////////////////////////////////
// method 1
cutStartTimer(timer_std);
for (unsigned int i = 0; i < lim ; ++i) {
//compute spectral conjugate
ComplexConjugate<<<4096,512>>> (tmp1_var);
}
cudaThreadSynchronize();
cutStopTimer(timer_std);
time_spent_std += cutGetTimerValue(timer_std);
cutResetTimer(timer_std);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
// method 2
cutStartTimer(timer_special);
for (unsigned int i = 0; i < lim; ++i) {
//compute spectral conjugate
ComplexConjugatev2<<<4096,512>>> (tmp1_var);
}
cudaThreadSynchronize();
cutStopTimer(timer_special);
time_spent_special += cutGetTimerValue(timer_special);
cutResetTimer(timer_special);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
// method 3
cutStartTimer(timer_special2);
for (unsigned int i = 0; i < lim; ++i) {
//compute spectral conjugate
ComplexConjugatev3<<<4*4096,512/4>>> (tmp1_var);
}
cudaThreadSynchronize();
cutStopTimer(timer_special2);
time_spent_special2 += cutGetTimerValue(timer_special2);
cutResetTimer(timer_special2);
cutilCheckMsg("Something went wrong");
/////////////////////////////////////////////////////
// method 4
cutStartTimer(timer_special3);
for (unsigned int i = 0; i < lim; ++i) {
//compute spectral conjugate
ComplexConjugatev4<<<4*4096,128>>> (tmp1_var);
}
cudaThreadSynchronize();
cutStopTimer(timer_special3);
time_spent_special3 += cutGetTimerValue(timer_special3);
cutResetTimer(timer_special3);
cutilCheckMsg("Something went wrong");
}
std::cout << "=============================" << std::endl;
std::cout << "= Testing complex conjugate =" << std::endl;
std::cout << "=============================" << std::endl;
std::cout << "= approach1 gives : " << time_spent_std/((double) xp_number) << std::endl;
std::cout << "=============================" << std::endl;
std::cout << "= approach2 gives : " << time_spent_special/((double) xp_number) << std::endl;
std::cout << "=============================" << std::endl;
std::cout << "= approach3 gives : " << time_spent_special2/((double) xp_number) << std::endl;
std::cout << "=============================" << std::endl;
std::cout << "= approach4 gives : " << time_spent_special3/((double) xp_number) << std::endl;
std::cout << "=============================" << std::endl;
cudaFree(tmp1_var);
}
// Complex conjugate
static __global__ void ComplexConjugate(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y = - a[threadID].y;
}
// Complex conjugate
static __global__ void ComplexConjugatev2(Complex* a)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
a[threadID].y *= -1.0;
}
static __global__ void ComplexConjugatev3(Complex* a)
{
const int threadID = threadIdx.x;
__shared__ cufftComplex tab[512];
tab[threadID] = a[blockIdx.x * blockDim.x + threadID];
tab[threadID].y *= - 1.0;
a[blockIdx.x * blockDim.x + threadID] = tab[threadID];
}
static __global__ void ComplexConjugatev4(Complex* a)
{
const int threadID = threadIdx.x;
__shared__ cufftComplex tab[128];
if ( threadID == 0 ){
for( unsigned int i = 0; i < 128; ++i) {
tab[i] = a[blockIdx.x * blockDim.x + i];
}
}
__syncthreads();
tab[threadID].y *= - 1.0;
__syncthreads();
if ( threadID == 0) {
for( unsigned int j = 0; j < 128; ++j) {
a[blockIdx.x * blockDim.x + j] = tab[j];
}
}
}