Hi,
I have a very annoying issue here. Basically, the double-precision computation in my code gives a wrong result when running on a Tesla C1060, but not on a Tesla C2050.
The test just multiplies a floating-point number by 32 and casts the result to an integer. I found that on the C1060 the double-precision result is wrong, but the single-precision result is fine. This difference between single and double precision does not appear on my C2050.
To compile the code, I use -arch=sm_12 for the C1060, and -arch=sm_20 or -arch=sm_12 for the C2050. The test code is:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <cutil_inline.h>
#define BlockSize 128
// Double-precision test kernel.
// Reads the first double3 element of d_P, multiplies its x component by 32,
// truncates the product to int, and stores the result in re[0].
// Every thread that executes this kernel writes the same value to re[0].
__global__ void testD(double3* d_P, int *re){
    const double3 point = d_P[0];
    // The integer constant 32 is promoted to double for the multiply.
    const double scaled = point.x * 32;
    re[0] = (int)scaled;
}
// Single-precision test kernel.
// Reads the first float3 element of d_P, multiplies its x component by 32,
// truncates the product to int, and stores the result in re[0].
// Every thread that executes this kernel writes the same value to re[0].
__global__ void testS(float3* d_P, int *re){
    const float3 point = d_P[0];
    // The integer constant 32 is converted to float for the multiply.
    const float scaled = point.x * 32;
    re[0] = (int)scaled;
}
// Host driver for the double-vs-float truncation test.
// Uploads the same three-component point once as double3 and once as float3,
// runs testD and testS with a single thread each, and prints the integer each
// kernel produced from (int)(x * 32).
// Returns 0 on completion; CUDA_SAFE_CALL handles any CUDA API error.
int main(){
    // Host copies of the test point in both precisions.
    double3 *d_Pd, Pd;
    Pd.x=0.840188;
    Pd.y=0.394383;
    Pd.z=0.783099;
    float3 *d_Ps, Ps;
    Ps.x=0.840188;
    Ps.y=0.394383;
    Ps.z=0.783099;

    // Device result slot (d_I) and its host mirror (I).
    int* d_I,I=0;

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_Pd,sizeof(double3)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_Ps,sizeof(float3)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_I,sizeof(int)));
    CUDA_SAFE_CALL(cudaMemcpy(d_Pd, &Pd, sizeof(double3), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_Ps, &Ps, sizeof(float3), cudaMemcpyHostToDevice));

    // Double-precision run. cudaGetLastError reports launch failures; the
    // blocking cudaMemcpy that follows waits for the kernel before reading d_I.
    testD<<<1,1>>>(d_Pd,d_I);
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaMemcpy(&I, d_I, sizeof(int), cudaMemcpyDeviceToHost));
    printf("double to int: %d\n",I);

    // Single-precision run, reusing the same device result slot.
    testS<<<1,1>>>(d_Ps,d_I);
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaMemcpy(&I, d_I, sizeof(int), cudaMemcpyDeviceToHost));
    printf("single to int: %d\n",I);

    // Release the device allocations instead of leaking them until exit.
    CUDA_SAFE_CALL(cudaFree(d_Pd));
    CUDA_SAFE_CALL(cudaFree(d_Ps));
    CUDA_SAFE_CALL(cudaFree(d_I));
    return 0;
}
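/*
compile command
nvcc -o cudaBug -arch=sm_12 -I/usr/local/stow/cudatoolkit_3.2.16/include -I/home_path/NVIDIA_GPU_Computing_SDK/C/common/inc -L/usr/local/stow/cudatoolkit_3.2.16/lib64 -lcudart -lstdc++ cudaBug.cu

NOTE(review): compute capability 1.2 targets have no double-precision support,
so nvcc demotes double to float when building with -arch=sm_12. The Tesla C1060
is compute capability 1.3, so building with -arch=sm_13 is the likely fix for
the wrong testD result -- confirm on hardware.
*/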
Can anyone help me figure out why the results differ? Thanks.