cuCimag fails

leeiq · July 30, 2009, 3:43am

// msinc kernel (diagnostic state): one thread handles one element of alpha.
//
// Parameters:
//   na    - number of elements in alpha and y
//   alpha - input vector of complex values (device pointer)
//   pm    - precision parameter; sqrt(6*pm) is the magnitude threshold that
//           selects between the two evaluation branches below
//   y     - output vector (device pointer), same length as alpha
//
// Launch layout: 1-D grid of 1-D blocks. Threads whose global index is >= na
// touch no memory, so the grid may be rounded up past na.
//
// Build note: the kernel computes in double precision, so it has to be
// compiled for a target that supports doubles (e.g. nvcc -arch sm_13).
__global__ void msinc_kernel(unsigned int na, cuDoubleComplex *alpha, double pm, cuDoubleComplex *y)
{
  const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // Every access to alpha[i] and y[i] stays inside this guard: the grid is
  // rounded up to a whole number of blocks, so some threads have i >= na and
  // must neither read nor write.
  if (i < na)
  {
    const double eps1 = sqrt(6.0*pm);

    const cuDoubleComplex v = alpha[i];
    const double x  = cuCreal(v);
    const double y2 = cuCimag(v);

    if (cuCabs(v) > eps1)
    {
      // Complex division (a + i*b) / (c + i*d).
      // NOTE(review): `a` uses cos(2.0*y2) while `b` uses sin(2.0*x);
      // confirm whether cos(2.0*x) was intended.
      double a=exp(-2.0*y2)*cos(2.0*y2)-1.0, b=sin(2.0*x)*exp(-2.0*y2);
      double c=-2.0*y2, d=2.0*x;

      y[i] = make_cuDoubleComplex((a*c+b*d)/(c*c+d*d), (b*c-a*d)/(c*c+d*d));
    }
    else
    {
      // Branch used when |alpha[i]| <= eps1.
      double x1 = (1-x*x/6.0+y2*y2/6.0)*cos(x)-x*y2*sin(x)/3.0;
      double y1 = (1-x*x/6.0+y2*y2/6.0)*sin(x)+x*y2*cos(x)/3.0;

      y[i] = make_cuDoubleComplex(x1, y1);
    }

    // Debug echo: overwrites the value computed above with the raw input so
    // the host can verify what the kernel actually read. Remove this line to
    // get the computed result.
    y[i] = make_cuDoubleComplex(x, y2);
  }

  // No __syncthreads() here: the kernel uses no shared memory and threads do
  // not exchange data, so a block-wide barrier before returning is not needed.
}

This is my kernel function. Everything else works fine except for y2 = cuCimag(alpha[i]). The results when running on the graphics card are:

(-9.686006e-08,-1.409392e+307)

(4.885526e-08,-1.550387e+307)

(4.621565e-08,-1.550387e+307)

(-2.569997e-08,-1.409392e+307)

(3.371802e-08,-1.268397e+307)

(-3.896651e-08,-1.691383e+307)

(8.329686e-08,-1.338894e+307)

(-5.119202e-08,-1.268396e+307)

(-3.486711e-08,-1.338895e+307)

(-7.364092e-08,-1.550387e+307)

But in emulation mode I get the correct results:

(-9.686006e-08,7.296060e-08)

(4.885526e-08,-8.956707e-08)

(4.621565e-08,-5.357445e-08)

(-2.569997e-08,6.052441e-08)

(3.371802e-08,9.878985e-08)

(-3.896651e-08,8.984847e-08)

(8.329686e-08,-2.966926e-08)

(-5.119202e-08,1.571757e-08)

(-3.486711e-08,-1.148769e-08)

(-7.364092e-08,-8.289431e-08)

What’s wrong with it?

mfatica · July 30, 2009, 4:09am

Could you post a complete repro?

Could you also try to use alpha[i].x and alpha[i].y to get the real and imaginary part?

leeiq · July 30, 2009, 1:35pm

I tried alpha[i].x and alpha[i].y, but it still fails. Here is my complete code:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include <cuComplex.h>

#include <cutil_inline.h>

// Returns a pseudo-random double obtained by scaling rand() into the
// interval that starts at `low` and has width (high - low).
double randDouble(double low, double high)
{
  // rand() mapped onto [0, 1): the divisor is RAND_MAX + 1, so 1.0 is never reached.
  const double unit = rand() / (static_cast<double>(RAND_MAX) + 1.0);
  const double span = high - low;
  return unit * span + low;
}

// Fills data[0 .. length-1] with random complex numbers and prints each one
// to stdout as "(real,imag)".
//
// Parameters:
//   data   - host array with room for at least `length` elements
//   length - number of elements to generate
void randomInit(cuDoubleComplex *data, unsigned int length)
{
  // The index is unsigned to match `length`; a signed index would be
  // converted in the comparison and could overflow for very large lengths.
  for (unsigned int i = 0; i < length; i++)
  {
    // Real and imaginary parts are drawn independently between -1e-7 and 1e-7.
    double a=randDouble(-1e-7,1e-7);
    double b=randDouble(-1e-7,1e-7);

    data[i]=make_cuDoubleComplex(a,b);

    printf("(%e,%e)\n",cuCreal(data[i]),cuCimag(data[i]));
  }
}

// Forward declaration so that main() can call runTest() before its definition.
void runTest(int argc, char** argv);

// msinc kernel, diagnostic repro version: one thread handles one element.
// The stores of the computed result are commented out; the kernel currently
// only copies alpha[i] to z[i] so the host can check the data it receives.
//
// Parameters:
//   na    - number of elements in alpha and z
//   alpha - input vector of complex values (device pointer)
//   pm    - precision parameter; sqrt(6*pm) is the magnitude threshold that
//           selects between the two evaluation branches below
//   z     - output vector (device pointer), same length as alpha
//
// Launch layout: 1-D grid of 1-D blocks. Threads whose global index is >= na
// touch no memory, so the grid may be rounded up past na.
//
// Build note: the kernel works on doubles, so it has to be compiled for a
// target that supports double precision (e.g. nvcc -arch sm_13).
__global__ void msinc_kernel(unsigned int na, cuDoubleComplex *alpha, double pm, cuDoubleComplex *z)
{
  const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // Every access to alpha[i] and z[i] stays inside this guard: the grid is
  // rounded up to a whole number of blocks, so some threads have i >= na and
  // must neither read nor write.
  if (i < na)
  {
    const double eps1 = sqrt(6.0*pm);

    const cuDoubleComplex v = alpha[i];
    const double x2 = v.x;
    const double y2 = v.y;

    if (cuCabs(v) > eps1)
    {
      // Complex division (a + i*b) / (c + i*d).
      // NOTE(review): `a` uses cos(2.0*y2) while `b` uses sin(2.0*x2);
      // confirm whether cos(2.0*x2) was intended.
      double a=exp(-2.0*y2)*cos(2.0*y2)-1.0, b=sin(2.0*x2)*exp(-2.0*y2);
      double c=-2.0*y2, d=2.0*x2;

      // Store disabled while debugging the input transfer:
//	   z[i] = make_cuDoubleComplex((a*c+b*d)/(c*c+d*d), (b*c-a*d)/(c*c+d*d));
    }
    else
    {
      // Branch used when |alpha[i]| <= eps1.
      double x1 = (1-x2*x2/6.0+y2*y2/6.0)*cos(x2)-x2*y2*sin(x2)/3.0;
      double y1 = (1-x2*x2/6.0+y2*y2/6.0)*sin(x2)+x2*y2*cos(x2)/3.0;

      // Store disabled while debugging the input transfer:
//	   z[i] = make_cuDoubleComplex(x1, y1);
    }

    // Debug echo of the input element.
    z[i] = make_cuDoubleComplex(x2, y2);
  }

  // No __syncthreads() here: the kernel uses no shared memory and threads do
  // not exchange data, so a block-wide barrier before returning is not needed.
}

// Program entry point: runs the test, then calls the SDK's cutilExit helper.
int main(int argc, char** argv)
{
    runTest(argc, argv);
    cutilExit(argc, argv);

    return 0;
}

// Host driver for the msinc_kernel repro:
//   1. selects a CUDA device (from the "device" command-line flag if given),
//   2. fills a host vector with random complex numbers (randomInit prints them),
//   3. copies it to the GPU, runs msinc_kernel, copies the result back,
//   4. prints the result and releases all host and device memory.
//
// Parameters:
//   argc, argv - command-line arguments, forwarded to the cutil helpers
void runTest(int argc, char** argv)
{
  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
    cutilDeviceInit(argc, argv);
  else
    cutilSafeCall(cudaSetDevice( cutGetMaxGflopsDeviceId() ));

  // set seed for rand()
  srand(2006);

  unsigned int na=10;  // vector length
  double pm=1.110223e-16;  // machine precision
  unsigned int mem_size=sizeof(cuDoubleComplex)*na;

  // Allocate memory for arrays alpha_h and y_h on CPU
  cuDoubleComplex* alpha_h = (cuDoubleComplex*) malloc ( mem_size );
  cuDoubleComplex* y_h = (cuDoubleComplex*) malloc ( mem_size );

  // Stop early instead of dereferencing a null pointer further down.
  if (alpha_h == NULL || y_h == NULL)
  {
    fprintf(stderr, "Host memory allocation failed\n");
    free(alpha_h);
    free(y_h);
    exit(EXIT_FAILURE);
  }

  // Initiate random complex vector
  randomInit ( alpha_h , na );

  // Define and allocate memory for arrays alpha_d and y_d on GPU
  cuDoubleComplex* alpha_d;
  cutilSafeCall(cudaMalloc ( ( void** ) &alpha_d, mem_size ));

  cuDoubleComplex* y_d;
  cutilSafeCall(cudaMalloc ( ( void** ) &y_d, mem_size ));

  // Copy alpha_h on main memory to alpha_d on GPU memory
  cutilSafeCall(cudaMemcpy( alpha_d, alpha_h, mem_size, cudaMemcpyHostToDevice ));

  // Round the grid up so that every element gets a thread.
  dim3 dimBlock(8);
  dim3 dimGrid((na+dimBlock.x-1)/dimBlock.x);

  // call kernel function to run on GPU
  msinc_kernel<<<dimGrid, dimBlock>>>(na,alpha_d,pm,y_d);

  cutilCheckMsg("Kernel execution failed");

  // Copy y_d on GPU memory to y_h on memory
  cutilSafeCall(cudaMemcpy( y_h, y_d, mem_size, cudaMemcpyDeviceToHost ));

  // Unsigned index to match the type of na.
  for ( unsigned int i = 0; i < na; i++ )
    printf("(%e,%e)\n", cuCreal(y_h[i]), cuCimag(y_h[i]));

  free(alpha_h);
  free(y_h);

  cutilSafeCall(cudaFree(alpha_d));
  cutilSafeCall(cudaFree(y_d));

  cudaThreadExit();
}

leeiq · July 30, 2009, 1:38pm

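I found that I wasn’t getting the right answer, so I checked whether the right data was being passed to the kernel by copying the input straight to the output. These are the results (the first ten lines are the input printed on the host, the last ten are what the kernel returned):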

(-9.686006e-08,7.296060e-08)

(4.885526e-08,-8.956707e-08)

(4.621565e-08,-5.357445e-08)

(-2.569997e-08,6.052441e-08)

(3.371802e-08,9.878985e-08)

(-3.896651e-08,8.984847e-08)

(8.329686e-08,-2.966926e-08)

(-5.119202e-08,1.571757e-08)

(-3.486711e-08,-1.148769e-08)

(-7.364092e-08,-8.289431e-08)

(-9.686006e-08,-5.486129e+303)

(4.885526e-08,-5.486129e+303)

(4.621565e-08,-5.486129e+303)

(-2.569997e-08,-5.486129e+303)

(3.371802e-08,-5.486129e+303)

(-3.896651e-08,-5.486129e+303)

(8.329686e-08,-5.486129e+303)

(-5.119202e-08,-5.486129e+303)

(-3.486711e-08,-5.486129e+303)

(-7.364092e-08,-5.486129e+303)

So I get the right real part but the wrong imaginary part.

mfatica · July 30, 2009, 3:35pm

It seems to work fine on the system I tested (64-bit Linux, CUDA 2.2):

nvcc -arch sm_13 doublecomplex.cu -I ~/NVIDIA_CUDA_SDK_2.2/common/inc/ -L ~/NVIDIA_CUDA_SDK_2.2/lib/ -lcutil

./a.out
(-2.217051e-08,-7.827979e-08)
(-8.243062e-08,-7.754010e-08)
(-7.270555e-08,8.917002e-08)
(-6.431749e-08,-4.225036e-08)
(-4.162501e-08,6.429711e-08)
(-8.014401e-09,4.963334e-08)
(9.992520e-08,6.613526e-08)
(2.968757e-08,-9.818558e-08)
(5.884543e-08,-7.681754e-08)
(3.323577e-08,3.490619e-08)
(-2.217051e-08,-7.827979e-08)
(-8.243062e-08,-7.754010e-08)
(-7.270555e-08,8.917002e-08)
(-6.431749e-08,-4.225036e-08)
(-4.162501e-08,6.429711e-08)
(-8.014401e-09,4.963334e-08)
(9.992520e-08,6.613526e-08)
(2.968757e-08,-9.818558e-08)
(5.884543e-08,-7.681754e-08)
(3.323577e-08,3.490619e-08)

leeiq · July 31, 2009, 3:07am

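Thanks, I didn’t include -arch sm_13 when compiling. Without that flag nvcc builds for an architecture without double-precision support, so the doubles in the kernel were not handled as doubles.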