This is my kernel function. Everything else works fine except for `y2 = cuCimag(...)`. When running on the graphics card, the results are:











but in emulation mode I get the correct result:











What's wrong with it?

Could you post a complete repro?

Could you also try to use alpha[i].x and alpha[i].y to get the real and imaginary part?

I tried alpha[i].x and alpha[i].y, but it still failed. This is my whole code:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include <cuComplex.h>

#include <cutil_inline.h>

/**
 * Return a pseudo-random double in the half-open interval [low, high).
 *
 * rand() is scaled by RAND_MAX + 1.0 so the unit factor is always strictly
 * below 1; the upper bound `high` itself is therefore never produced
 * (unless low == high, in which case that single value is returned).
 *
 * @param low   inclusive lower bound
 * @param high  exclusive upper bound
 */
double randDouble(double low, double high)
{
  return (rand() / (static_cast<double>(RAND_MAX) + 1.0))* (high - low) + low;
}

/**
 * Fill `data` with `length` random complex numbers.
 *
 * The real and imaginary parts are drawn independently with randDouble()
 * from [-1e-7, 1e-7).
 *
 * @param data    host array with room for at least `length` elements
 * @param length  number of elements to generate
 */
void randomInit(cuDoubleComplex *data, unsigned int length)
{
  // Unsigned index so the comparison against `length` is not signed/unsigned.
  for (unsigned int i = 0; i < length; i++)
  {
    double a = randDouble(-1e-7, 1e-7);  // real part, in [-1e-7, 1e-7)
    double b = randDouble(-1e-7, 1e-7);  // imaginary part, same range

    // NOTE(review): the store below was missing from the posted listing
    // (lost together with the closing braces); reconstructed from the two
    // values computed above -- confirm against the original source.
    data[i] = make_cuDoubleComplex(a, b);
  }
}

void runTest(int argc, char** argv);

/**
 * Per-element kernel over a complex vector (one thread per element).
 *
 * Launch layout: 1-D grid of 1-D blocks; thread i handles element i and
 * threads with i >= na do nothing. Uses no shared memory.
 *
 * Current state is a debugging pass-through: both formula branches compute
 * their intermediate values, but the stores of those results are commented
 * out and z[i] simply receives a copy of alpha[i], so the data path into
 * the kernel can be verified.
 *
 * Uses double precision throughout, so it must be built for a target with
 * double support (e.g. `nvcc -arch sm_13`).
 *
 * @param na     number of elements in alpha and z
 * @param alpha  device input vector (length na)
 * @param pm     precision parameter; the branch threshold is sqrt(6 * pm)
 * @param z      device output vector (length na)
 */
__global__ void msinc_kernel(unsigned int na, cuDoubleComplex *alpha, double pm, cuDoubleComplex *z)
{
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // Bounds check BEFORE touching alpha: the grid is rounded up to whole
  // blocks, so tail threads would otherwise read past the end of the array.
  if (i >= na)
    return;

  double eps1 = sqrt(6.0 * pm);

  cuDoubleComplex v = alpha[i];  // single load, reused below
  double x2 = v.x;               // real part
  double y2 = v.y;               // imaginary part

  if (cuCabs(v) > eps1)
  {
    // (exp(2i*alpha) - 1) / (2i*alpha), written out in real arithmetic:
    //   numerator   a + i*b = exp(-2y) * (cos 2x + i sin 2x) - 1
    //   denominator c + i*d = -2y + i*2x
    // The real part uses cos(2*x2); the posted code had cos(2*y2), which is
    // inconsistent with the sin(2*x2) term in b.
    double a = exp(-2.0*y2)*cos(2.0*x2) - 1.0, b = sin(2.0*x2)*exp(-2.0*y2);
    double c = -2.0*y2, d = 2.0*x2;
//	   z[i] = make_cuDoubleComplex((a*c+b*d)/(c*c+d*d), (b*c-a*d)/(c*c+d*d));
  }
  else
  {
    // Small-|alpha| branch.
    // NOTE(review): this looks like (1 - alpha^2/6) * exp(i*x), but the sign
    // of the x2*y2 cross terms is opposite to what that expansion gives --
    // verify before re-enabling the store below.
    double x1 = (1-x2*x2/6.0+y2*y2/6.0)*cos(x2)-x2*y2*sin(x2)/3.0;
    double y1 = (1-x2*x2/6.0+y2*y2/6.0)*sin(x2)+x2*y2*cos(x2)/3.0;
//	   z[i] = make_cuDoubleComplex(x1, y1);
  }

  // Debug echo: write the input straight back out.
  z[i] = make_cuDoubleComplex(x2, y2);
}

/**
 * Program entry point: runs the test, then hands control to the SDK's
 * cutilExit helper with the command-line arguments.
 */
int main(int argc, char** argv)
{
	runTest(argc, argv);

	cutilExit(argc, argv);

	return 0;
}

/**
 * Host driver: builds a random complex vector, runs msinc_kernel on it and
 * prints the resulting vector, one "(re,im)" pair per line.
 *
 * Device selection: if the "device" command-line flag is present the SDK
 * helper cutilDeviceInit picks the device, otherwise the device reported by
 * cutGetMaxGflopsDeviceId() is used.
 *
 * The kernel works in double precision; build with a target that supports
 * it (e.g. `nvcc -arch sm_13`).
 */
void runTest(int argc, char** argv)
{
  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
  {
		cutilDeviceInit(argc, argv);
  }
  else
  {
		cutilSafeCall(cudaSetDevice( cutGetMaxGflopsDeviceId() ));
  }

  // set seed for rand()
  // NOTE(review): the seeding statement that followed this comment is
  // missing from the posted listing. Without a call to srand(), rand()
  // yields the same sequence on every run.

  unsigned int na=10;  // vector length

  double pm=1.110223e-16;  // machine precision

  unsigned int mem_size=sizeof(cuDoubleComplex)*na;

  // Allocate memory for arrays alpha_h and y_h on CPU
  cuDoubleComplex* alpha_h = (cuDoubleComplex*) malloc ( mem_size );
  cuDoubleComplex* y_h = (cuDoubleComplex*) malloc ( mem_size );
  if (alpha_h == NULL || y_h == NULL)
  {
    fprintf(stderr, "runTest: host allocation of %u bytes failed\n", mem_size);
    free(alpha_h);  // free(NULL) is a no-op
    free(y_h);
    exit(EXIT_FAILURE);
  }

  // Initiate random complex vector
  randomInit ( alpha_h , na );

  // Define and allocate memory for arrays alpha_d and y_d on GPU
  cuDoubleComplex* alpha_d;
  cutilSafeCall(cudaMalloc ( ( void** ) &alpha_d, mem_size ));

  cuDoubleComplex* y_d;
  cutilSafeCall(cudaMalloc ( ( void** ) &y_d, mem_size ));

  // Copy alpha_h on main memory to alpha_d on GPU memory
  cutilSafeCall(cudaMemcpy( alpha_d, alpha_h, mem_size, cudaMemcpyHostToDevice ));

  // One thread per element; the grid is rounded up to whole blocks, and the
  // kernel's own bounds check discards the surplus threads.
  dim3 dimBlock(8);
  dim3 dimGrid((na+dimBlock.x-1)/dimBlock.x);

  // call kernel function to run on GPU
  msinc_kernel<<<dimGrid, dimBlock>>>(na,alpha_d,pm,y_d);

  cutilCheckMsg("Kernel execution failed");

  // Copy y_d on GPU memory to y_h on memory
  // (blocking copy: also waits for the kernel to finish)
  cutilSafeCall(cudaMemcpy( y_h, y_d, mem_size, cudaMemcpyDeviceToHost ));

  for ( unsigned int i = 0; i < na; i++ )
  {
	  printf("(%e,%e)\n", cuCreal(y_h[i]), cuCimag(y_h[i]));
  }

  // Release device and host buffers.
  cutilSafeCall(cudaFree( alpha_d ));
  cutilSafeCall(cudaFree( y_d ));
  free(alpha_h);
  free(y_h);
}

I found that I wasn't getting the right answer, so I checked whether the right data was being passed to the kernel by writing the input straight back out. These are the results I got:





















So I get the right real part but the wrong imaginary part.

It seems to work fine on the system I tested (64-bit Linux, CUDA 2.2), compiled with:

nvcc -arch sm_13 -I ~/NVIDIA_CUDA_SDK_2.2/common/inc/ -L ~/NVIDIA_CUDA_SDK_2.2/lib/ -lcutil


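Thanks, I didn't include -arch sm_13 when compiling. Without that flag nvcc targets a compute capability that has no double-precision support and demotes `double` to `float` in device code, so the kernel no longer handles the host's double-precision complex data correctly.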