cudaMemcpy and toolkit 3.1: cudaMemcpy to host not working.

// Host-side driver fragment: allocate a device output buffer, run
// map_log_kernel_qd over `len` elements, copy the results back and print the
// first one. `r`, `t`, `begin` and `len` come from the enclosing scope.
//
// Every CUDA call is checked. A kernel launch returns no status of its own, so
// without these checks a launch that fails leaves d_out unwritten and the copy
// below hands back whatever the buffer happened to contain.
GPU_qd* d_out = NULL;
cudaError_t cudaStatus = cudaMalloc((void**)&d_out, sizeof(GPU_qd) * len);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc(d_out) failed: %s\n", cudaGetErrorString(cudaStatus));
}
// Host buffer that receives the kernel output.
GPU_qd* h1_c = (GPU_qd*)malloc(sizeof(GPU_qd) * len);
if (h1_c == NULL) {
    fprintf(stderr, "malloc(h1_c) failed\n");
}
//kernel
const int numBlock = 256;
const int numThread = 256;
map_log_kernel_qd<<<numBlock, numThread>>>(r, t, d_out, begin, len);
// Launch-configuration errors (e.g. too many resources requested for the
// chosen block size) are only visible through cudaGetLastError().
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "map_log_kernel_qd launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
//copy results from GPU
if (h1_c != NULL) {
    // cudaMemcpy blocks until the kernel has finished, so a fault raised while
    // the kernel was executing is reported through its return value.
    cudaStatus = cudaMemcpy(h1_c, d_out, sizeof(GPU_qd) * len, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy(h1_c <- d_out) failed: %s\n", cudaGetErrorString(cudaStatus));
    } else {
        printf("h1_c %f\n", h1_c[0].d1.x);
    }
}

     By using printf in the kernel, I know that non-zero values are being assigned to d_out, but when I print out h1_c
     I get zero when it should be a large number. This is on a GTX 470, using toolkit 3.1 and Visual Studio 2008 on Windows 7 Ultimate.
                                  Mike

Could you post more of the kernel code, please? The host code looks OK at first sight.

Could you post more of the kernel code, please? The host code looks OK at first sight.

/*
 * Fills d_out[k] = r - t * log(begin + k) for k in [0, numElement).
 *
 * r, t        quad-double coefficients, passed by value.
 * d_out       device buffer with room for at least numElement GPU_qd values.
 * begin       value of the first argument handed to log().
 * numElement  number of outputs to produce; nothing is written when <= 0.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total thread count of the grid, so any grid
 * size covers the whole range.
 *
 * make_qd, log and the GPU_qd operators are defined elsewhere.
 */
__global__
void map_log_kernel_qd( const GPU_qd r, const GPU_qd t, GPU_qd* d_out,const unsigned long long begin, const int numElement ) {
    // A negative count would wrap when added to the unsigned `begin` and make
    // the loop bound meaningless, so bail out early.
    if( numElement <= 0 ) {
        return;
    }

    // Widen before multiplying so the products cannot overflow 32 bits.
    const unsigned long long index = (unsigned long long)blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned long long delta = (unsigned long long)blockDim.x*gridDim.x;
    const unsigned long long end = begin + (unsigned long long)numElement;

    for( unsigned long long i = index+begin; i < end; i += delta ) {
        // The conversion to double is exact only while i < 2^53.
        GPU_qd d_i = make_qd((double)i);
        // i - begin is in [0, numElement), the offset into the output buffer.
        d_out[i-begin] = r - t*log( d_i );
    }
}

This is the kernel. d_out is a local variable for the function that calls the kernel. Thanks for your help.

                                   Mike

/*
 * Fills d_out[k] = r - t * log(begin + k) for k in [0, numElement).
 *
 * r, t        quad-double coefficients, passed by value.
 * d_out       device buffer with room for at least numElement GPU_qd values.
 * begin       value of the first argument handed to log().
 * numElement  number of outputs to produce; nothing is written when <= 0.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total thread count of the grid, so any grid
 * size covers the whole range.
 *
 * make_qd, log and the GPU_qd operators are defined elsewhere.
 */
__global__
void map_log_kernel_qd( const GPU_qd r, const GPU_qd t, GPU_qd* d_out,const unsigned long long begin, const int numElement ) {
    // A negative count would wrap when added to the unsigned `begin` and make
    // the loop bound meaningless, so bail out early.
    if( numElement <= 0 ) {
        return;
    }

    // Widen before multiplying so the products cannot overflow 32 bits.
    const unsigned long long index = (unsigned long long)blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned long long delta = (unsigned long long)blockDim.x*gridDim.x;
    const unsigned long long end = begin + (unsigned long long)numElement;

    for( unsigned long long i = index+begin; i < end; i += delta ) {
        // The conversion to double is exact only while i < 2^53.
        GPU_qd d_i = make_qd((double)i);
        // i - begin is in [0, numElement), the offset into the output buffer.
        d_out[i-begin] = r - t*log( d_i );
    }
}

This is the kernel. d_out is a local variable for the function that calls the kernel. Thanks for your help.

                                   Mike

/*
 * Fills d_out[k] = r - t * log(begin + k) for k in [0, numElement).
 *
 * r, t        quad-double coefficients, passed by value.
 * d_out       device buffer with room for at least numElement GPU_qd values.
 * begin       value of the first argument handed to log().
 * numElement  number of outputs to produce; nothing is written when <= 0.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total thread count of the grid, so any grid
 * size covers the whole range.
 *
 * make_qd, log and the GPU_qd operators are defined elsewhere.
 */
__global__
void map_log_kernel_qd( const GPU_qd r, const GPU_qd t, GPU_qd* d_out,const unsigned long long begin, const int numElement ) {
    // A negative count would wrap when added to the unsigned `begin` and make
    // the loop bound meaningless, so bail out early.
    if( numElement <= 0 ) {
        return;
    }

    // Widen before multiplying so the products cannot overflow 32 bits.
    const unsigned long long index = (unsigned long long)blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned long long delta = (unsigned long long)blockDim.x*gridDim.x;
    const unsigned long long end = begin + (unsigned long long)numElement;

    for( unsigned long long i = index+begin; i < end; i += delta ) {
        // The conversion to double is exact only while i < 2^53.
        GPU_qd d_i = make_qd((double)i);
        // i - begin is in [0, numElement), the offset into the output buffer.
        d_out[i-begin] = r - t*log( d_i );
    }
}

This is the kernel. d_out is a local variable for the function that calls the kernel. Thanks for your help.

                                   Mike

/*
 * Fills d_out[k] = r - t * log(begin + k) for k in [0, numElement).
 *
 * r, t        quad-double coefficients, passed by value.
 * d_out       device buffer with room for at least numElement GPU_qd values.
 * begin       value of the first argument handed to log().
 * numElement  number of outputs to produce; nothing is written when <= 0.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total thread count of the grid, so any grid
 * size covers the whole range.
 *
 * make_qd, log and the GPU_qd operators are defined elsewhere.
 */
__global__
void map_log_kernel_qd( const GPU_qd r, const GPU_qd t, GPU_qd* d_out,const unsigned long long begin, const int numElement ) {
    // A negative count would wrap when added to the unsigned `begin` and make
    // the loop bound meaningless, so bail out early.
    if( numElement <= 0 ) {
        return;
    }

    // Widen before multiplying so the products cannot overflow 32 bits.
    const unsigned long long index = (unsigned long long)blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned long long delta = (unsigned long long)blockDim.x*gridDim.x;
    const unsigned long long end = begin + (unsigned long long)numElement;

    for( unsigned long long i = index+begin; i < end; i += delta ) {
        // The conversion to double is exact only while i < 2^53.
        GPU_qd d_i = make_qd((double)i);
        // i - begin is in [0, numElement), the offset into the output buffer.
        d_out[i-begin] = r - t*log( d_i );
    }
}

This is the kernel. d_out is a local variable for the function that calls the kernel. Thanks for your help.

                                   Mike