Hello,
I am allocating a chunk of memory on the device by using malloc from inside a kernel/device function.
But when I copy to (or from) that memory using cudaMemcpy, the call fails.
To ease porting of existing code, it would be nicer if I could allocate memory via malloc and still copy to it via cudaMemcpy.
Here is the reproducer:
#include <stdio.h>
/*
 * dugCudaCheck(result): evaluate a CUDA runtime expression exactly once.
 * If it does not yield cudaSuccess, print "<file>#<line>: <error string>"
 * to stderr and terminate the process with the error code as exit status.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a
 * single statement and can be followed by a semicolon anywhere a statement
 * is allowed. Every line but the last must end in a backslash so the
 * preprocessor treats the whole body as part of the definition.
 */
#define dugCudaCheck(result)                                        \
    do {                                                            \
        cudaError_t status = (result);                              \
        if (status != cudaSuccess) {                                \
            fprintf(stderr, "%s#%d: %s\n", __FILE__, __LINE__,      \
                    cudaGetErrorString(status));                    \
            exit(status);                                           \
        }                                                           \
    } while (0)
// Smoke-test kernel: every launched thread prints a greeting through
// device-side printf. Takes no arguments and touches no memory.
__global__ void cuda_hello() {
    printf("Hello World from GPU\n");
}
// Allocates one char with in-kernel malloc(), stores 'd' in it, and publishes
// the resulting address as an integer in addr[0] so the host can read it back.
//
// addr: device pointer to at least one unsigned long; receives the address
//       (0 if the allocation failed).
//
// Every thread that runs this kernel writes addr[0]; the reproducer launches
// it with a single thread.
// NOTE(review): unsigned long is narrower than a pointer on LLP64 platforms
// (e.g. 64-bit Windows); uintptr_t would be the portable carrier type.
__global__ void device_alloc(unsigned long* addr) {
    char* a = (char*) malloc(1 * sizeof(char));
    // In-kernel malloc returns NULL when it cannot satisfy the request;
    // do not write through a null pointer.
    if (a != NULL) {
        *a = 'd';
    }
    addr[0] = (unsigned long) a;
}
// Interprets the integer `addr` as a device address of a char and prints the
// character stored there.
//
// addr: address value previously produced on the device (in the reproducer,
//       the value written by device_alloc); it is dereferenced as-is.
__global__ void print_data(unsigned long addr) {
    // Reinterpret the integer as a pointer; the original text had lost the
    // '*' characters, leaving a plain char that cannot be dereferenced.
    char* c = (char*) addr;
    printf("print_data: char is %c\n", *c);
}
// Prints the character that `c` points to.
//
// c: pointer dereferenced in device code; must be an address the kernel can
//    read.
__global__ void print_data_by_pointer(char* c) {
    printf("print_data_by_pointer: char is %c\n", *c);
}
// Reproducer driver:
//   1. runs a hello-world kernel,
//   2. has a kernel allocate one char with in-kernel malloc() and hand the
//      address back through a cudaMalloc'ed slot,
//   3. shows that kernels can read that address,
//   4. shows that cudaMemcpy works on a cudaMalloc'ed pointer,
//   5. attempts cudaMemcpy on the in-kernel malloc'ed pointer (the call the
//      report says fails with "invalid argument").
// Returns 0 if every step succeeds; dugCudaCheck exits with the CUDA error
// code on the first failing runtime call.
int main() {
    dugCudaCheck(cudaSetDevice(0));

    cuda_hello<<<1,1>>>();
    dugCudaCheck(cudaGetLastError());        // launch-configuration errors
    dugCudaCheck(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Device-side slot that receives the address produced inside the kernel.
    unsigned long* addr;
    dugCudaCheck(cudaMalloc((void**)&addr, sizeof(unsigned long)));
    device_alloc<<<1,1>>>(addr);
    dugCudaCheck(cudaGetLastError());

    unsigned long addr_host[1];
    dugCudaCheck(cudaMemcpy(addr_host, addr, sizeof(unsigned long), cudaMemcpyDeviceToHost));
    // %p requires a void* argument; passing an unsigned long is undefined.
    printf("CUDA addr = %p\n", (void*) addr_host[0]);
    if (addr_host[0] == 0) {
        // Nothing to demonstrate if the kernel did not produce an address.
        fprintf(stderr, "device-side malloc returned NULL\n");
        dugCudaCheck(cudaFree(addr));
        return 1;
    }

    printf("Lets check we can access this address on device and print data\n");
    print_data<<<1,1>>>(addr_host[0]);
    dugCudaCheck(cudaGetLastError());
    char* c_dev = (char*) addr_host[0];
    print_data_by_pointer<<<1,1>>>(c_dev);
    dugCudaCheck(cudaGetLastError());
    dugCudaCheck(cudaDeviceSynchronize());

    char c[1], c2[1];
    char* c_dev2;
    dugCudaCheck(cudaMalloc((void**)&c_dev2, sizeof(char)));
    printf("Trying to cudaMemcpy from pointer allocated from cudaMalloc\n");
    dugCudaCheck(cudaMemcpy(c2, c_dev2, sizeof(char), cudaMemcpyDeviceToHost));
    printf("SUCCESS\n");

    printf("Trying to cudaMemcpy from pointer allocated form malloc\n");
    // This is the call under test: c_dev came from in-kernel malloc(), not
    // from cudaMalloc. Per the report it fails with "invalid argument".
    dugCudaCheck(cudaMemcpy(c, c_dev, sizeof(char), cudaMemcpyDeviceToHost));
    printf("SUCCESS\n");

    // Release both cudaMalloc'ed buffers (c_dev2 was previously leaked).
    // NOTE: the char allocated inside device_alloc is never free()d by any
    // kernel in this reproducer.
    dugCudaCheck(cudaFree(c_dev2));
    dugCudaCheck(cudaFree(addr));
    dugCudaCheck(cudaDeviceSynchronize());
    return 0;
}
After compiling it with `nvcc mmcp.cu` and running the resulting binary, I get: `mmcp.cu#61: invalid argument`
I understand that this might be because cudaMemcpy can only operate on memory that is known to the runtime (i.e. allocated by cudaMalloc), but is there a way to register the device memory range after the fact?
Regards.
Jacek Tomaka