Hardware: Tesla C1060
Software: Linux 2.6.18-53.el5
I ran into a strange problem with double-precision computation when I tried to use double in vvolkov’s speedy FFT. To keep it simple, I wrote a test sample:
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
// Complex product of a and b, treating .x as the real part and .y as the
// imaginary part: (a.x + i*a.y) * (b.x + i*b.y).
// Declared __host__ __device__ so it can be called from kernels as well as
// from host code; a plain `inline` function is compiled for the host only.
inline __host__ __device__ double2 operator*( double2 a, double2 b )
{
    return make_double2( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x );
}
// Component-wise sum of two double2 values.
// Declared __host__ __device__ so it can be called from kernels as well as
// from host code; a plain `inline` function is compiled for the host only.
inline __host__ __device__ double2 operator+( double2 a, double2 b )
{
    return make_double2( a.x + b.x, a.y + b.y );
}
// Component-wise difference (a - b) of two double2 values.
// Declared __host__ __device__ so it can be called from kernels as well as
// from host code; a plain `inline` function is compiled for the host only.
inline __host__ __device__ double2 operator-( double2 a, double2 b )
{
    return make_double2( a.x - b.x, a.y - b.y );
}
// Double-precision sum/difference kernel, operating in place on n.
//
// Thread t (t = threadIdx.x) replaces the pair (n[t], n[t+32]) with
// (n[t] + n[t+32], n[t] - n[t+32]), using the values the pair held on entry.
// Only threadIdx.x is used for indexing, so n must hold at least t + 33
// elements for the largest t launched.
//
// NOTE(review): double arithmetic in device code needs a build that targets a
// double-capable architecture (e.g. `nvcc -arch=sm_13` for a Tesla C1060);
// older nvcc versions are documented to demote double to float otherwise --
// confirm against the build flags actually used.
__global__ void FUN_DOUBLE_CORE( double2 *n)
{
    double2 *lower = n + threadIdx.x;
    double2 *upper = lower + 32;

    // Keep the original lower element: *lower is overwritten before the
    // difference is formed.
    const double2 saved = *lower;

    *lower = saved + *upper;
    *upper = saved - *upper;
}
// Host-side launcher for FUN_DOUBLE_CORE: one block of 32 threads on the
// default stream. n is handed to the kernel unchanged. The launch is not
// checked for errors here.
void FUN_DOUBLE( double2 *n)
{
    const dim3 gridShape( 1 );
    const dim3 blockShape( 32 );
    FUN_DOUBLE_CORE<<< gridShape, blockShape >>>( n );
}
// Single-precision sum/difference kernel, operating in place on n.
//
// Thread t (t = threadIdx.x) replaces the pair (n[t], n[t+32]) with
// (n[t] + n[t+32], n[t] - n[t+32]), using the values the pair held on entry.
// Only threadIdx.x is used for indexing, so n must hold at least t + 33
// elements for the largest t launched.
//
// Relies on float2 operator+ / operator- overloads, which are not defined in
// the visible part of this file.
__global__ void FUN_SINGLE_CORE( float2 *n)
{
    float2 *lower = n + threadIdx.x;
    float2 *upper = lower + 32;

    // Keep the original lower element: *lower is overwritten before the
    // difference is formed.
    const float2 saved = *lower;

    *lower = saved + *upper;
    *upper = saved - *upper;
}
// Host-side launcher for FUN_SINGLE_CORE: one block of 32 threads on the
// default stream. n is handed to the kernel unchanged. The launch is not
// checked for errors here.
void FUN_SINGLE( float2 *n)
{
    const dim3 gridShape( 1 );
    const dim3 blockShape( 32 );
    FUN_SINGLE_CORE<<< gridShape, blockShape >>>( n );
}
//
// MAIN
//
// Print a diagnostic on stderr and exit when a CUDA runtime call did not
// return cudaSuccess. `what` names the call for the message.
static void checkCuda( cudaError_t status, const char *what )
{
    if( status == cudaSuccess )
        return;
    fprintf( stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString( status ) );
    exit( EXIT_FAILURE );
}

// Test driver: fills two 64-element host arrays (double2 and float2) with
// (1, 2), runs FUN_DOUBLE and FUN_SINGLE on device copies of them, and prints
// every resulting element.
//
// Command line: an optional "-device <index>" pair selects the CUDA device
// (default 0).
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main( int argc, char **argv )
{
    // Scan for "-device <index>"; the loop stops one short of argc so that
    // argv[i+1] is always a valid argument.
    int idevice = 0;
    for( int i = 1; i < argc-1; i ++ )
        if( strcmp( argv[i], "-device" ) == 0 )
            idevice = atoi( argv[i+1] );

    checkCuda( cudaSetDevice( idevice ), "cudaSetDevice" );

    struct cudaDeviceProp prop;
    checkCuda( cudaGetDeviceProperties( &prop, idevice ), "cudaGetDeviceProperties" );
    printf( "\nDevice: %s, %.0f MHz clock, %.0f MB memory.\n", prop.name, prop.clockRate/1000.f, prop.totalGlobalMem/1024.f/1024.f );

    // The kernels are launched with 32 threads and each thread touches
    // elements t and t+32, so exactly 64 elements are needed.
    int size = 64;
    const size_t doubleBytes = size*sizeof(double2);
    const size_t floatBytes = size*sizeof(float2);

    double2 *n1 = (double2 *)malloc( doubleBytes );
    float2 *n2 = (float2 *)malloc( floatBytes );
    if( n1 == NULL || n2 == NULL )
    {
        fprintf( stderr, "host allocation failed\n" );
        free( n1 );
        free( n2 );
        return EXIT_FAILURE;
    }

    for(int i=0; i<size; i++){
        n1[i].x=1; n1[i].y=2;
        n2[i].x=1; n2[i].y=2;
    }

    // --- double-precision run ---
    double2 *d_n = NULL;
    checkCuda( cudaMalloc( (void**)&d_n, doubleBytes ), "cudaMalloc(d_n)" );
    checkCuda( cudaMemcpy( d_n, n1, doubleBytes, cudaMemcpyHostToDevice ), "cudaMemcpy H2D (double)" );
    FUN_DOUBLE(d_n);
    // Launch-configuration errors are only reported through cudaGetLastError.
    checkCuda( cudaGetLastError(), "FUN_DOUBLE launch" );
    // The blocking copy back also surfaces errors raised while the kernel ran.
    checkCuda( cudaMemcpy( n1, d_n, doubleBytes, cudaMemcpyDeviceToHost ), "cudaMemcpy D2H (double)" );
    for(int i=0; i<size; i++){
        printf("double: n1[%d].x = %f n1[%d].y = %f\n", i,n1[i].x, i,n1[i].y);
    }

    // --- single-precision run ---
    float2 *s_n = NULL;
    checkCuda( cudaMalloc( (void**)&s_n, floatBytes ), "cudaMalloc(s_n)" );
    checkCuda( cudaMemcpy( s_n, n2, floatBytes, cudaMemcpyHostToDevice ), "cudaMemcpy H2D (float)" );
    FUN_SINGLE(s_n);
    checkCuda( cudaGetLastError(), "FUN_SINGLE launch" );
    checkCuda( cudaMemcpy( n2, s_n, floatBytes, cudaMemcpyDeviceToHost ), "cudaMemcpy D2H (float)" );
    for(int i=0; i<size; i++){
        printf("float :n2[%d].x = %f n2[%d].y = %f\n", i,n2[i].x, i,n2[i].y);
    }

    // Release device and host buffers (previously leaked).
    checkCuda( cudaFree( d_n ), "cudaFree(d_n)" );
    checkCuda( cudaFree( s_n ), "cudaFree(s_n)" );
    free( n1 );
    free( n2 );
    return 0;
}
I notice that the result of FUN_DOUBLE is totally wrong. What is happening? Or have I made a mistake somewhere?