When I try to do:
cudaMemcpy(b_h, a_d, sizeof(double)*N, cudaMemcpyDeviceToHost);
with b_h and a_d declared as double, it does not work (b_h does not receive the contents of a_d).
But when b_h and a_d are declared as float, it works.
Has anyone else had the same problem?
Amir
Could you provide a simple code sample that demonstrates this?
In my experience, cudaMemcpy works for both “float” and “double”.
// incrementArray.cu
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Host-side reference implementation: adds one to each of the first N
// elements of `a`, in place. Does nothing when N <= 0.
void incrementArrayOnHost(double *a, int N)
{
    int pos = 0;
    while (pos < N) {
        a[pos] += 1.0;
        ++pos;
    }
}
// Device kernel: adds one to each of the first N elements of `a`, in place,
// mirroring incrementArrayOnHost so the two results can be compared.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// contain more threads than N; surplus threads do nothing.
// Uses no shared memory.
// Requires double-precision hardware (compute capability 1.3 or newer) and a
// build that targets it.
__global__ void incrementArrayOnDevice(double *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail: the last block can extend past the end of the array.
    if (idx < N) {
        a[idx] = a[idx] + 1.0;
    }
}
// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation that produced `status`.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills an array with 0..N-1, processes one copy with incrementArrayOnHost
// and a device copy with the incrementArrayOnDevice kernel, then asserts that
// the two results match element by element.
// Every CUDA call is checked so that a kernel that cannot launch or run is
// reported instead of silently leaving the data unchanged.
int main(void)
{
    double *a_h, *b_h;   // pointers to host memory
    double *a_d = NULL;  // pointer to device memory
    int i, N = 1000;
    size_t size = N * sizeof(double);

    // allocate arrays on host
    a_h = (double *)malloc(size);
    b_h = (double *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    // allocate array on device
    checkCuda(cudaMalloc((void **)&a_d, size), "cudaMalloc");

    // initialization of host data
    for (i = 0; i < N; i++) a_h[i] = (double)i;

    // copy data from host to device
    checkCuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
              "host-to-device copy");

    // do calculation on host
    incrementArrayOnHost(a_h, N);

    // do calculation on device:
    // Part 1 of 2. Compute execution configuration (ceil-div so a partial
    // final block still covers the tail of the array).
    int blockSize = 256;
    int nBlocks = (N + blockSize - 1) / blockSize;

    // Read the device array back before the kernel runs so the "before"
    // value printed below is real device data rather than uninitialised
    // host memory.
    checkCuda(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
              "device-to-host copy (before kernel)");
    printf("a_d[0]=%f\n", b_h[0]);

    // Part 2 of 2. Call incrementArrayOnDevice kernel
    incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N);
    // A launch does not return a status; launch failures are only visible
    // through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch");

    // Retrieve result from device and store in b_h. This blocking copy also
    // waits for the kernel and reports any error raised while it ran.
    checkCuda(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
              "device-to-host copy (after kernel)");
    printf("a_d[0]=%f\n", b_h[0]);

    // check results
    for (i = 0; i < N; i++) assert(a_h[i] == b_h[i]);

    // cleanup: device memory must be released with cudaFree, not free()
    free(a_h);
    free(b_h);
    checkCuda(cudaFree(a_d), "cudaFree");
    return 0;
}
It is likely that the kernel isn’t actually running, either because you are running on a compute capability 1.1 card, or because you aren’t building for compute capability 1.3 (by passing -arch=sm_13 to nvcc).
Yes, I am running on a compute capability 1.1 card.
What is the problem?
They don’t support double precision, that’s the problem.
Ok
Thanks