Reporting a problem with double precision in CUDA 2.0: double precision shows strange behavior

Hardware: Tesla C1060

Software: Linux 2.6.18-53.el5

I ran into a strange problem with double precision computation. When I tried to use double in vvolkov's speedy FFT, the results came out wrong. To make it simple, I wrote a test sample:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <time.h>
#include <sys/time.h>

// Complex arithmetic helpers; __device__ is needed so the kernels can call them.
inline __device__ __host__ double2 operator*( double2 a, double2 b ) { return make_double2( a.x*b.x-a.y*b.y, a.x*b.y+a.y*b.x ); }
inline __device__ __host__ double2 operator+( double2 a, double2 b ) { return make_double2( a.x + b.x, a.y + b.y ); }
inline __device__ __host__ double2 operator-( double2 a, double2 b ) { return make_double2( a.x - b.x, a.y - b.y ); }

// float2 counterparts used by FUN_SINGLE_CORE.
inline __device__ __host__ float2 operator+( float2 a, float2 b ) { return make_float2( a.x + b.x, a.y + b.y ); }
inline __device__ __host__ float2 operator-( float2 a, float2 b ) { return make_float2( a.x - b.x, a.y - b.y ); }

	

// Double-precision butterfly: each of the 32 threads combines n[tid] and n[tid+32].
__global__ void FUN_DOUBLE_CORE( double2 *n )
{
	int tid = threadIdx.x;

	double2 c0 = n[tid];
	n[tid]    = c0 + n[tid+32];
	n[tid+32] = c0 - n[tid+32];
}

void FUN_DOUBLE( double2 *n )
{
	FUN_DOUBLE_CORE<<< 1, 32 >>>( n );
}

// Single-precision version of the same butterfly.
__global__ void FUN_SINGLE_CORE( float2 *n )
{
	int tid = threadIdx.x;

	float2 c0 = n[tid];
	n[tid]    = c0 + n[tid+32];
	n[tid+32] = c0 - n[tid+32];
}

void FUN_SINGLE( float2 *n )
{
	FUN_SINGLE_CORE<<< 1, 32 >>>( n );
}

//
//  MAIN
//
int main( int argc, char **argv )
{
	// Select the device given on the command line with "-device <id>".
	int idevice = 0;
	for( int i = 1; i < argc-1; i++ )
		if( strcmp( argv[i], "-device" ) == 0 )
			idevice = atoi( argv[i+1] );

	cudaSetDevice( idevice );
	struct cudaDeviceProp prop;
	cudaGetDeviceProperties( &prop, idevice );
	printf( "\nDevice: %s, %.0f MHz clock, %.0f MB memory.\n", prop.name, prop.clockRate/1000.f, prop.totalGlobalMem/1024.f/1024.f );

	double2 *n1;
	double2 *d_n;
	float2  *n2;
	float2  *s_n;
	int size = 64;

	// Initialize host arrays with the same values for both precisions.
	n1 = (double2 *)malloc(size*sizeof(double2));
	n2 = (float2 *)malloc(size*sizeof(float2));
	for(int i=0; i<size; i++){
		n1[i].x=1; n1[i].y=2;
		n2[i].x=1; n2[i].y=2;
	}

	// Double-precision run.
	cudaMalloc( (void**)&d_n, size*sizeof(double2));
	cudaMemcpy( d_n, n1, size*sizeof(double2), cudaMemcpyHostToDevice);
	FUN_DOUBLE(d_n);
	cudaMemcpy( n1, d_n, size*sizeof(double2), cudaMemcpyDeviceToHost);
	for(int i=0; i<size; i++){
		printf("double: n1[%d].x = %f n1[%d].y = %f\n", i,n1[i].x, i,n1[i].y);
	}

	// Single-precision run.
	cudaMalloc( (void**)&s_n, size*sizeof(float2));
	cudaMemcpy( s_n, n2, size*sizeof(float2), cudaMemcpyHostToDevice );
	FUN_SINGLE(s_n);
	cudaMemcpy( n2, s_n, size*sizeof(float2), cudaMemcpyDeviceToHost);
	for(int i=0; i<size; i++){
		printf("float : n2[%d].x = %f n2[%d].y = %f\n", i,n2[i].x, i,n2[i].y);
	}

	return 0;
}

And I noticed that the result of FUN_DOUBLE is totally wrong. What happened? Or did I make some mistake?
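(A minimal launch check like the sketch below, using only cudaGetLastError and cudaGetErrorString, can rule out a failed kernel launch; it would not flag the silent double-to-float demotion that turns out to be the cause here.)

	FUN_DOUBLE(d_n);
	// Check whether the kernel launch itself failed; a demoted-double kernel
	// still launches fine, so cudaSuccess here does not mean the math is right.
	cudaError_t err = cudaGetLastError();
	if( err != cudaSuccess )
		printf( "launch error: %s\n", cudaGetErrorString(err) );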

Furthermore, to make it even simpler, I rewrote FUN_DOUBLE_CORE:

__global__ void FUN_DOUBLE_CORE( double2 *n )
{
	int tid = threadIdx.x;
	n[tid] = make_double2( 8.0, 8.0 );   // just store a constant
}

But when I printf the values of the double array n1, they never change! I am totally confused…

did you compile with -arch sm_13?

Oops, I will try it later, thanks…

Why doesn’t the compiler print warning messages when doubles are being demoted to floats?

I have tried it; compiling with -arch sm_13 is the right way. ^_^
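For reference, the flag goes on the nvcc command line when building the test program; the file name below is just a placeholder:

	nvcc -arch sm_13 test.cu -o test

Without it, nvcc compiles for the default sm_10 target and demotes every double to float in device code.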

Amen to this. I think the current behavior of auto-demotion is horrible. I don’t think anyone is going to write kernels which rely on this compiler behavior. Please change this, NVIDIA! (Or at least give us a compiler option we can use to make this sane.)
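One defensive option in the meantime (just a sketch, reusing the cudaGetDeviceProperties call already in the test program): check the compute capability at runtime and bail out loudly, since native double precision needs compute capability 1.3 or higher. This only catches the hardware side; it cannot tell you which -arch the kernel was compiled for.

	struct cudaDeviceProp prop;
	cudaGetDeviceProperties( &prop, idevice );
	// Native double precision requires compute capability >= 1.3 (e.g. Tesla C1060).
	if( prop.major == 1 && prop.minor < 3 )
	{
		printf( "Device %s cannot run double precision kernels.\n", prop.name );
		return 1;
	}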