Is there a way to keep device pointers to memory that has already been allocated on the GPU? The problem I am having is this: I have a C++ program which is my top-level program. It calls into a my_code.cu file. I want to have one function in this my_code.cu file that does all the device allocation and copying of initial parameters. After that, it launches my CUDA kernel, then returns to my main C++ program. However, on subsequent calls, I would like to SKIP those previous steps of allocating and copying to the device. If possible, I’d like to just pass around the memory locations that I already cudaMalloc-ed on the device so I can skip those steps. Has anyone successfully done this? Thanks!
I would suggest using the functionality of C++ on the host side, using an object to wrap your function. Construct the object with the initial parameters, and the constructor can allocate the device memory and keep track of it as part of the object state. Then invoke a method on the object to invoke the kernel. The object destructor can free the device memory.
you can do that, I have done it.
Your CUDA function that allocates the memory should return a device pointer, and you can store (but of course not dereference) this pointer in the C++ program. Then you just have to call the CUDA function that launches the kernel, passing the pointer as a parameter.
Jamie K’s method is nicely object-oriented; I think it should work, but I’ve never tried it that way.
you can do that, I have done it.
Your CUDA function that allocates the memory should return a device pointer, and you can store (but of course not dereference) this pointer in the C++ program. Then you just have to call the CUDA function that launches the kernel, passing the pointer as a parameter.
Jamie K’s method is nicely object-oriented; I think it should work, but I’ve never tried it that way.
thedeusirae, would you mind posting some code that does this? I am trying to do exactly what you are describing, but at run time I get an “invalid device pointer” error when I do a single cudaMalloc and pass the pointer around. Thanks!
Here is a test code. The aim of this code is to copy a matrix (int**) from the host to the device. Since the host matrix could be really huge, I wanted to find a way to copy it line by line, so I don’t have to allocate the entire matrix on the host.
Also, one thing is missing from the code: don’t forget to free the memory!
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
/*
 * Appends one host row to a device-resident table of row pointers.
 *
 * The row `line` (linesize ints) is copied into a fresh device buffer, and a
 * new device table holding the previous row pointers followed by the new one
 * is allocated and returned.
 *
 * Parameters:
 *   line        host buffer holding `linesize` ints to upload.
 *   struct_ptr  device table returned by a previous call, or NULL when no row
 *               has been uploaded yet (in which case `linenumber` is ignored).
 *   linesize    number of ints in `line`; must be > 0.
 *   linenumber  number of row pointers currently stored in `struct_ptr`.
 *
 * Returns:
 *   The new device table (linenumber + 1 entries, or 1 entry when struct_ptr
 *   is NULL), or NULL on invalid arguments or on any allocation/copy failure.
 *
 * Ownership (realloc-like): on success the old table `struct_ptr` is released
 * with cudaFree and must not be used again; the row buffers it referenced are
 * carried over into the new table. On failure `struct_ptr` is left untouched.
 */
int ** cudaGetPointeur(int * line, int ** struct_ptr, int linesize, int linenumber)
{
    if (line == NULL || linesize <= 0)
        return NULL;

    /* A NULL table means "no rows yet", whatever linenumber says. */
    int old_count = (struct_ptr == NULL) ? 0 : linenumber;
    if (old_count < 0)
        return NULL;

    size_t row_bytes = (size_t)linesize * sizeof(int);
    size_t old_bytes = (size_t)old_count * sizeof(int*);
    size_t new_bytes = ((size_t)old_count + 1) * sizeof(int*);

    int *  d_row   = NULL;   /* device copy of `line`          */
    int ** d_table = NULL;   /* new device table of row pointers */
    cudaError_t err = cudaSuccess;

    /* Blank line emitted when the table is first created (original output). */
    if (struct_ptr == NULL)
        printf("\n");

    /* Host staging copy of the pointer table; entries are device addresses
       and are only ever copied, never dereferenced on the host. */
    int ** h_table = (int**)malloc(new_bytes);
    if (h_table == NULL)
    {
        fprintf(stderr, "cudaGetPointeur: host allocation failed\n");
        return NULL;
    }

    /* Each step runs only if every previous one succeeded. */
    if (old_count > 0)
        err = cudaMemcpy(h_table, struct_ptr, old_bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_row, row_bytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_row, line, row_bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_table, new_bytes);
    if (err == cudaSuccess)
    {
        h_table[old_count] = d_row;
        err = cudaMemcpy(d_table, h_table, new_bytes, cudaMemcpyHostToDevice);
    }

    free(h_table);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaGetPointeur: %s\n", cudaGetErrorString(err));
        /* cudaFree(NULL) is a no-op, so partially completed work is safe to undo. */
        cudaFree(d_table);
        cudaFree(d_row);
        return NULL;
    }

    /* The new table supersedes the old one; release the old table only. */
    cudaFree(struct_ptr);
    return d_table;
}
/*
 * Copies a matrix back from the device row by row and prints it to stdout,
 * one row per output line with each value followed by a space.
 *
 * Parameters:
 *   struct_ptr  device table of `linenumber` device row pointers.
 *   linesize    number of ints in each row.
 *   linenumber  number of rows in the table.
 *
 * Nothing is printed when struct_ptr is NULL or either size is not positive.
 * Printing stops at the first failed copy and the error is reported on stderr.
 */
void readMatrice(int ** struct_ptr, int linesize, int linenumber)
{
    if (struct_ptr == NULL || linesize <= 0 || linenumber <= 0)
        return;

    size_t table_bytes = (size_t)linenumber * sizeof(int*);
    size_t row_bytes   = (size_t)linesize * sizeof(int);

    /* Host copy of the device pointer table, plus one row buffer that is
       reused for every row instead of being reallocated per iteration. */
    int ** h_rows = (int**)malloc(table_bytes);
    int *  h_line = (int*)malloc(row_bytes);
    if (h_rows == NULL || h_line == NULL)
    {
        fprintf(stderr, "readMatrice: host allocation failed\n");
        free(h_line);
        free(h_rows);
        return;
    }

    cudaError_t err = cudaMemcpy(h_rows, struct_ptr, table_bytes, cudaMemcpyDeviceToHost);
    for (int i = 0; err == cudaSuccess && i < linenumber; i++)
    {
        /* h_rows[i] is a device address: fetch its contents before printing. */
        err = cudaMemcpy(h_line, h_rows[i], row_bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
            break;
        for (int j = 0; j < linesize; j++)
            printf("%d ", h_line[j]);
        printf("\n");
    }

    if (err != cudaSuccess)
        fprintf(stderr, "readMatrice: %s\n", cudaGetErrorString(err));

    free(h_line);
    free(h_rows);
}
/*
 * Releases a device pointer table and every device row it references.
 * `linenumber` is the number of valid row pointers stored in the table.
 */
static void freeMatrice(int ** struct_ptr, int linenumber)
{
    if (struct_ptr == NULL)
        return;

    if (linenumber > 0)
    {
        size_t table_bytes = (size_t)linenumber * sizeof(int*);
        int ** h_rows = (int**)malloc(table_bytes);
        if (h_rows != NULL)
        {
            /* The row addresses live on the device; fetch them to free each row. */
            if (cudaMemcpy(h_rows, struct_ptr, table_bytes, cudaMemcpyDeviceToHost) == cudaSuccess)
            {
                for (int i = 0; i < linenumber; i++)
                    cudaFree(h_rows[i]);
            }
            free(h_rows);
        }
    }
    cudaFree(struct_ptr);
}

/*
 * Test driver: builds a 5x10 host matrix with mat[i][j] = j + 10*i, prints it,
 * uploads it to the device one row at a time through cudaGetPointeur, prints
 * the device copy through readMatrice, then releases host and device memory.
 * Returns 0 on success and 1 if a row could not be uploaded.
 */
int main()
{
    const int ROWS = 5;
    const int COLS = 10;
    int status = 0;

    /* Build and print the host matrix. */
    int ** mat = new int*[ROWS];
    for (int i = 0; i < ROWS; i++)
    {
        mat[i] = new int[COLS];
        for (int j = 0; j < COLS; j++)
        {
            mat[i][j] = j + COLS * i;
        }
    }
    for (int i = 0; i < ROWS; i++)
    {
        for (int j = 0; j < COLS; j++)
        {
            printf("%d ", mat[i][j]);
        }
        printf("\n");
    }

    /* Upload row by row; `uploaded` tracks how many rows the table holds so a
       partial upload can still be cleaned up. */
    int ** ptr = NULL;
    int uploaded = 0;
    for (int i = 0; i < ROWS; i++)
    {
        int ** next = cudaGetPointeur(mat[i], ptr, COLS, i);
        if (next == NULL)
        {
            fprintf(stderr, "main: failed to upload row %d\n", i);
            status = 1;
            break;
        }
        ptr = next;
        uploaded = i + 1;
    }

    if (status == 0)
        readMatrice(ptr, COLS, ROWS);

    /* Release device memory, then the host matrix. */
    freeMatrice(ptr, uploaded);
    for (int i = 0; i < ROWS; i++)
        delete[] mat[i];
    delete[] mat;

    system("pause");
    return status;
}[/codebox]