This is an example of my code, as u can see i am setting device to host transfer, and my dest* is a host pointer and my src* is a device pointer.
#include <stdio.h>
global void BasicExample(int* D)
{
for(int k = 0; k < 1024; ++k)
{
D[k] = k;
}
}
int main(int argc, char *argv)
{
int device = 0;
printf(“size of ptr: %li \n”, sizeof(void**));
cudaSetDevice(device);
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, device);
/////////////////////////////////////////////
// setup data
//
// allocate CPU and GPU memory
N = 1024;
size = N * sizeof(int);
int h_D[N]; //host pointer
int* d_D; //Device Pointer
cudaMalloc((void**)&d_D, size);
kernel launch configuration
dim3 dimBlock(4,4); //random stuff
dim3 dimGrid(4, 4); //random stuff
BasicExample<<<dimGrid,dimBlock>>>(d_D);
cudaMemcpy(&h_D[0], d_D, size, cudaMemcpyDeviceToHost));
if( cudaGetLastError() != cudaSuccess )
{
//it always enter here
printf("-----------------------------------\n");
printf("data download from GPU failed\n");
printf("-----------------------------------\n");
exit(-2);
return 0;
}
cudaThreadExit();
exit(EXIT_SUCCESS);
return 0;
}