Hello,
the following test case shows a really strange change in behaviour:
#include <cutil_inline.h>
#include <stdio.h>
typedef float my_t;
/*
 * Kernel: sets every element of X to zero.
 *
 * X is treated as a flat array of size * size elements (the same elements the
 * original i * size + j indexing touched). X must be dereferenceable from
 * device code.
 *
 * Launch layout: any 1D grid of 1D blocks. The elements are distributed over
 * all launched threads with a grid-stride loop, so a <<<1, 1>>> launch still
 * clears the whole array, while larger launches split the work instead of
 * having every thread rewrite every element. No shared memory is used.
 */
__global__ void kill(my_t* X, int size) {
    // A non-positive extent means there is nothing to clear.
    if (size <= 0) {
        return;
    }

    // Do the index arithmetic in size_t so size * size cannot overflow int.
    const size_t count  = (size_t)size * (size_t)size;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    for (size_t idx = first; idx < count; idx += stride) {
        X[idx] = 0;
    }
}
/* Returns true when status is cudaSuccess; otherwise reports the failure on stderr. */
static bool cudaCallSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Copies the size * size host array A to the device, runs the kill kernel on
 * it with a single thread, and copies the result back into A.
 *
 * Prints A[0] before the kernel launch and, if every CUDA call succeeded,
 * after the copy back. Any failing CUDA call (allocation, copy, kernel launch
 * or execution) is reported on stderr instead of being silently ignored; in
 * that case the "after" line is not printed because A may not have been
 * updated.
 *
 * NOTE(review): when my_t is double, the device code presumably has to be
 * built for an architecture with double-precision support (e.g. an
 * appropriate nvcc -arch flag) so host and device agree on the element size;
 * confirm against the build flags.
 */
void run(my_t* A, int size) {
    const size_t bytes = sizeof(my_t) * size * size;
    my_t* devA = NULL;

    if (!cudaCallSucceeded(cudaMalloc((void**)&devA, bytes), "cudaMalloc")) {
        return;
    }

    bool ok = cudaCallSucceeded(
        cudaMemcpy(devA, A, bytes, cudaMemcpyHostToDevice),
        "cudaMemcpy (host to device)");

    if (ok) {
        printf("before %f\n", A[0]);
        kill<<<1, 1>>>(devA, size);
        // A launch does not return a status; launch failures are read here.
        ok = cudaCallSucceeded(cudaGetLastError(), "kill kernel launch");
    }

    if (ok) {
        // The blocking copy also reports errors raised while the kernel ran.
        ok = cudaCallSucceeded(
            cudaMemcpy(A, devA, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)");
    }

    // Release the device buffer on every path, including failures.
    cudaCallSucceeded(cudaFree(devA), "cudaFree");

    if (ok) {
        printf("after %f\n", A[0]);
    }
}
/*
 * Fills the size * size host array A so that the element at row r, column c
 * (stored at A[r * size + c]) holds r + c + 1.
 */
void init(my_t* A, int size) {
    for (int r = 0; r < size; ++r) {
        // Start of the current row inside the flat array.
        my_t* rowStart = A + r * size;
        for (int c = 0; c < size; ++c) {
            rowStart[c] = (r + c) + 1.0;
        }
    }
}
/*
 * Test driver: allocates a 3 x 3 host array of my_t, reports the element
 * size, fills the array with init() and passes it through run().
 *
 * Exits with status 1 if the host allocation fails.
 */
int main(int argc, char* argv[]) {
    // Command-line arguments are not used by this test case.
    (void)argc;
    (void)argv;

    const int size = 3;
    my_t* A = (my_t*)malloc(sizeof(my_t) * size * size);
    if (A == NULL) {
        fprintf(stderr, "Can't alloc A\n");
        exit(1);
    }

    // sizeof yields size_t; cast so the argument matches the %d conversion.
    printf("Running with %d byte floating point numbers\n", (int)sizeof(my_t));

    init(A, size);
    run(A, size);

    free(A);
    return 0;
}
nvcc --compiler-options -g -I/home/pkilian/cuda/include -I/home/pkilian/NVIDIA_CUDA_SDK/common/inc -o broken.cu.o -c broken.cu
g++ -g -O0 -o broken broken.cu.o -L/home/pkilian/cuda/lib -L/home/pkilian/NVIDIA_CUDA_SDK/lib -L/home/pkilian/NVIDIA_CUDA_SDK/common/lib/linux -lcudart -L/home/pkilian/cuda/lib -lcutil
./broken
Running with 4 byte floating point numbers
before 1.000000
after 0.000000
This is the behaviour I expected. But if I change the “float” to “double”, the following happens:
nvcc --compiler-options -g -I/home/pkilian/cuda/include -I/home/pkilian/NVIDIA_CUDA_SDK/common/inc -o broken.cu.o -c broken.cu
g++ -g -O0 -o broken broken.cu.o -L/home/pkilian/cuda/lib -L/home/pkilian/NVIDIA_CUDA_SDK/lib -L/home/pkilian/NVIDIA_CUDA_SDK/common/lib/linux -lcudart -L/home/pkilian/cuda/lib -lcutil
./broken
Running with 8 byte floating point numbers
before 1.000000
after 1.000000
A[0][0] is no longer set to 0!
This is with CUDA toolkit 2.1, CUDA SDK 2.1, g++ (GCC) 4.1.3 20080704 (prerelease) (Debian 4.1.2-25) on a quad-core 64-bit machine with an Intel Core2 Quad CPU @ 2.40GHz and 4 GB of RAM. The graphics card is a GeForce GTX 295 from Gainward.
Did I miss a bug in my code? Is this a bug in CUDA? Or what’s going on here? Any pointers?