Bug in cudaMemcpy with double.

When I try to do:
cudaMemcpy(b_h, a_d, sizeof(double)*N, cudaMemcpyDeviceToHost);
When b_h and a_d are double arrays, it does not work (b_h does not receive the contents of a_d).
But when b_h and a_d are float arrays, it works.
Has anyone else had the same problem?
Amir

Could you provide a simple code example that demonstrates this?

In my experience, cudaMemcpy works for both “float” and “double”.

// incrementArray.cu

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>

// Host-side reference: adds 1 to each of the first N elements of a, in place.
// Does nothing when N <= 0.
void incrementArrayOnHost(double *a, int N)
{
    int k = 0;
    while (k < N) {
        a[k] += 1.0;
        ++k;
    }
}

// Device kernel: adds 1 to each of the first N elements of a, in place.
//
// Launch layout: 1D grid of 1D blocks, one array element per thread. Any
// configuration with gridDim.x * blockDim.x >= N covers the whole array.
// No shared memory is used.
//
// a must be a device pointer to at least N doubles.
// NOTE: double precision needs compute capability 1.3 or later and a matching
// nvcc -arch setting.
__global__ void incrementArrayOnDevice(double *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to a whole number of blocks, so threads past the
    // end of the array must not touch memory.
    if (idx < N) {
        a[idx] = a[idx] + 1;
    }
}

// Evaluates a CUDA runtime call and, on failure, prints the file, line and
// CUDA error string to stderr and exits. Without this, a failed allocation,
// copy or launch goes unnoticed and the host buffer just keeps stale data.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Fills an array with 0..N-1, runs incrementArrayOnHost on the host copy and
// incrementArrayOnDevice on the device copy, prints element 0 of the device
// array before and after the kernel, and asserts that both results match.
// Returns 0 on success; exits with a diagnostic on any allocation or CUDA error.
int main(void)
{
    double *a_h, *b_h;  // pointers to host memory
    double *a_d;        // pointer to device memory
    int i, N = 1000;
    size_t size = N * sizeof(double);

    // allocate arrays on host
    a_h = (double *)malloc(size);
    b_h = (double *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    // allocate array on device
    CUDA_CHECK(cudaMalloc((void **)&a_d, size));

    // initialization of host data
    for (i = 0; i < N; i++) a_h[i] = (double)i;

    // copy data from host to device
    CUDA_CHECK(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice));

    // do calculation on host
    incrementArrayOnHost(a_h, N);

    // do calculation on device:
    // Part 1 of 2. Compute execution configuration.
    // Block size is a multiple of the warp size; the grid is rounded up so
    // that nBlocks * blockSize >= N.
    int blockSize = 256;
    int nBlocks = (N + blockSize - 1) / blockSize;

    // Fetch the current device value of element 0 so the "before" print shows
    // real device data rather than uninitialised host memory.
    CUDA_CHECK(cudaMemcpy(b_h, a_d, sizeof(double), cudaMemcpyDeviceToHost));
    printf("a_d[0]=%f\n", b_h[0]);

    // Part 2 of 2. Call incrementArrayOnDevice kernel
    incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N);
    // A launch does not return a status; launch failures (bad configuration,
    // no kernel image for this device) are reported here.
    CUDA_CHECK(cudaGetLastError());

    // Retrieve result from device and store in b_h. This blocking copy waits
    // for the kernel to finish, so errors raised while the kernel was
    // executing are reported by this call.
    CUDA_CHECK(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost));

    printf("a_d[0]=%f\n", b_h[0]);

    // check results: adding 1 to the integers 0..N-1 is exact in double
    // precision, so exact equality is the right comparison here.
    for (i = 0; i < N; i++) assert(a_h[i] == b_h[i]);

    // cleanup: host buffers with free, device buffer with cudaFree
    free(a_h);
    free(b_h);
    CUDA_CHECK(cudaFree(a_d));

    return 0;
}

It is likely that the kernel isn’t actually running, either because you are running on a compute capability 1.1 card, or because you aren’t building for compute capability 1.3 (passing -arch=sm_13 to nvcc).

Yes, I am running on a compute capability 1.1 card.

What is the problem?

Compute capability 1.1 cards don’t support double precision (it requires compute capability 1.3 or later); that’s the problem.

Ok

Thanks