Copy data to device memory

I want to create a pointer in the host memory, add some data to the pointer and load it to the device memory to do calculations afterwards.

An example from the programming guide:
__device__ float* devPointer; float* ptr; cudaMalloc(&ptr, 256 * sizeof(float)); cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));
My code:

```
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__device__ float* devPointer;

__global__ void test()
{
printf("devPointer[%d] = %g\n",threadIdx.x, devPointer[threadIdx.x]);
}

int main(void)
{
float* ptr;
cudaMalloc(&ptr, 8 * sizeof(float));

//add data to the pointer  
for(int s = 0; s < 8 ; s++)  
   {
	ptr[s] = s;  
   }

cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));

test<<<1,8>>>();  

return 0;  

}
```

I tried the above suggestion from the programming guide but I cannot get it to function properly. Help is highly appreciated.

float* ptr;
cudaMalloc(&ptr, 8 * sizeof(float));

//add data to the pointer  
for(int s = 0; s < 8 ; s++)  
   {
	ptr[s] = s;  
   }

ptr points to device memory. You cannot initialize the data with a for-loop on the host.

Thank you for pointing that out.

I could use:
float* ptr = (float*)malloc(8 * sizeof(float));
// add data
float* devPointer; cudaMalloc(&devPointer, 8 * sizeof(float));
cudaMemcpy(devPointer, ptr, 8 * sizeof(float), cudaMemcpyHostToDevice);

But then I need to invoke the devPointer : test<<<1,1>>>(devPointer);

If the data is stored in the device, then I don't need to invoke it. That gives me cleaner code.

Is there a way to store a pointer from the host in the device memory so I don't need to invoke the pointer?

There is no such thing as invoking a pointer, it is called “dereferencing”.
Also your first example could be corrected to the following:

#include <stdio.h>
#include <stdlib.h>

// Statically sized array of eight floats declared with the __device__ qualifier.
__device__ float devPointer[8];

// Each thread prints the devPointer element selected by its own threadIdx.x.
// Assumes the launch uses at most 8 threads per block; the kernel itself
// performs no bounds check.
__global__ void test()
{
    const unsigned int tid = threadIdx.x;
    const float value = devPointer[tid];
    printf("devPointer[%d] = %f\n", tid, value);
}

// Copies eight host floats into the __device__ array devPointer, launches
// test with one thread per element and waits for the kernel to finish so its
// printf output is flushed.
//
// Returns 0 on success and 1 if any CUDA call reports an error.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    float hostData[8] = {1,2,3,4,5,6,7,8};
    // Derive the element count from the array so the copy size and the
    // launch width cannot drift apart.
    const int count = (int)(sizeof(hostData) / sizeof(hostData[0]));

    cudaError_t err = cudaMemcpyToSymbol(devPointer, hostData, sizeof(hostData));
    if (err == cudaSuccess) {
        test<<<1, count>>>();
        // A kernel launch returns nothing; launch-configuration errors are
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernel executes surface at the next
        // synchronizing call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

Also, I’m pretty sure you need to work on your C skills. A pointer is not an array. Mind this:

float* x;
sizeof(x); // Is ALWAYS going to be 4 on 32-bit and 8 on 64-bit systems

float arr[8];
sizeof(arr); //This is going to be 32 (8 * sizeof(float) = 8 * 4 = 32)

Thank you for the explanation, Manuel. If I copy to the device memory, with the precursor __device__, it needs to be of a constant value.

No it doesn’t.

I guess by this “invoke” you mean pass the pointer to device code.

Yes, and it is via declaration of a __device__ variable, and perhaps/typically initialization of that variable via cudaMemcpyToSymbol.

__device__ float *mypointer;

// Allocates room for 256 floats from inside the kernel and stores the
// resulting address in the __device__ variable mypointer.
// NOTE(review): every thread that executes this kernel performs its own
// allocation and overwrites mypointer; presumably intended for a
// single-thread launch. Confirm against the caller. The malloc result is not
// checked here.
__global__ void k(){
  mypointer = (float *)malloc(256 * sizeof(float));
}

or:

float *data;
cudaMalloc(&data, sizeof(float)*256);
cudaMemcpyToSymbol(mypointer, &data, sizeof(float *));

All of this seems to me to be a lot of additional boilerplate just to satisfy the request “so I don’t have to invoke”, but to each their own.

The term “invoke” is used in the cuda-c-programming-guide, so that’s why I used it. I wrote: “If I copy to the device memory, with the precursor __device__, it needs to be of a constant value.” I meant a constant size. But you explained that's not the case.

Thank you for the explanation.

The result:

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Device-memory readings, in bytes, that main() fills in through
// cudaMemGetInfo: free_0, free_1 and free_2 receive successive free-memory
// snapshots, and total receives the device's total memory on each call.
size_t free_0 = 0, free_1 = 0, free_2 = 0, total = 0;

// Pointer variable declared with the __device__ qualifier; the host assigns
// it with cudaMemcpyToSymbol so kernels can reach the buffer without taking
// it as a parameter.
__device__ float *devPointer;

// Each thread prints the devPointer element selected by its own threadIdx.x.
// Takes no arguments: the buffer is reached through the devPointer symbol.
// Assumes devPointer already addresses at least blockDim.x floats; the kernel
// performs no bounds check.
__global__ void test2()
{
    const unsigned int tid = threadIdx.x;
    const float value = devPointer[tid];
    printf("devPointer[%d] = %g\n", tid, value);
}

// Each thread prints the devData element selected by its own threadIdx.x.
//
// devData: pointer the kernel dereferences directly, so it must be valid in
//          device code. Assumed to hold at least blockDim.x floats; the
//          kernel performs no bounds check.
__global__ void test(float *devData)
{
    const unsigned int tid = threadIdx.x;
    const float value = devData[tid];
    printf("devData[%d] = %g\n", tid, value);
}

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the failing location and the runtime's error string to stderr and exits.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t status_ = (call);                                        \
        if (status_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(status_));                            \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

// Demonstrates two ways of handing a device buffer to a kernel:
//   1. passing the pointer as a kernel argument (test), and
//   2. publishing the pointer through the __device__ symbol devPointer (test2).
// Free-memory snapshots are stored in free_0 / free_1 / free_2 and printed
// along the way.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails.
int main(void)
{
    const int N = 8;
    const size_t bytes = N * sizeof(float);

    CUDA_CHECK(cudaMemGetInfo(&free_0, &total));
    // size_t is printed with %zu; %lu is only correct where size_t happens to
    // be unsigned long.
    printf("CUDA memory | free: %zu bytes , total: %zu bytes , difference: %zu bytes or %zu kilo bytes\n\n",
           free_0, total, total - free_0, (total - free_0) / 1000);

    // Host staging buffer.
    float *hostData = (float *)malloc(bytes);
    if (hostData == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }

    // Device buffer; this is the only device allocation the program needs.
    float *devData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&devData, bytes));

    for (int s = 0; s < N; s++) {
        hostData[s] = (float)(s + (s / 10.0));
    }
    for (int k = 0; k < N; k++) {
        printf("hostData[%d] = %g\n", k, hostData[k]);
    }
    printf("\n");

    // Copy the N values from host to device (size derived from N, not a literal).
    CUDA_CHECK(cudaMemcpy(devData, hostData, bytes, cudaMemcpyHostToDevice));

    // Variant 1: the buffer is passed as a kernel argument.
    test<<<1, N>>>(devData);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // execution errors + flush printf

    // Variant 2: store the *pointer value* in the __device__ symbol.
    // Only sizeof(float *) bytes are copied, host -> device symbol; no second
    // buffer is needed, so there is no cudaMalloc for devPointer. Taking
    // &devPointer in host code would not yield the device variable's address.
    CUDA_CHECK(cudaMemcpyToSymbol(devPointer, &devData, sizeof(devData)));

    CUDA_CHECK(cudaMemGetInfo(&free_1, &total));
    // Guard the subtraction: the values are unsigned and free memory can also
    // change for reasons outside this program.
    const size_t used = (free_0 > free_1) ? (free_0 - free_1) : 0;
    printf("\nDevice memory used since start: %zu bytes or %zu kilo bytes \n\n",
           used, used / 1000);

    test2<<<1, N>>>();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // devPointer merely aliases devData, so it must not be dereferenced by any
    // kernel launched after this free.
    CUDA_CHECK(cudaFree(devData));

    CUDA_CHECK(cudaMemGetInfo(&free_2, &total));
    // After a free the later snapshot is the larger one; subtracting the other
    // way round wraps around in unsigned arithmetic.
    const size_t reclaimed = (free_2 > free_1) ? (free_2 - free_1) : 0;
    printf("\nFreed devData memory gives back %zu bytes or %zu kilo bytes \n\n",
           reclaimed, reclaimed / 1000);

    free(hostData);
    return 0;
}