Hello, I have a simple struct that may look something like this:
template<class T>
struct Arr {
T* data;
unsigned int count;
__host__ void MoveToDevice() {
T* device; // Temporary memory buffer
COMPUTE_SAFE(cudaMalloc(&(device), count * sizeof(T)));
COMPUTE_SAFE(cudaMemcpy(device, data, count * sizeof(T), cudaMemcpyHostToDevice));
delete[] data; // Free host-side memory since we've copied it to the GPU
data = device;
}
__host__ __device__ void MoveToHost() {
T* host = new T[count]; // Temporary memory buffer
COMPUTE_SAFE(cudaMemcpy(host, data, count * sizeof(T), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaFree(data)); // Free device-side memory since we've copied it to the CPU
data = host;
}
}
The struct contains a function for moving the instance to, and from the GPU. the data
pointer gets copied properly, but the count
property does not - how can I copy the count
property to and from the GPU on the same instance?
Thanks.
I’m adding another test case, showcasing what I’ve tried so far:
struct Test {
int x, y;
int* data;
};
static __global__ void TestKernel(Test* d) {
d->x = 100;
d->y = 2;
printf("%d %d %d\n", d->data[0], d->data[1], d->data[2]);
}
void TestCUDA()
{
Test* host = (Test*)malloc(sizeof(Test));
host->x = 1;
host->y = 2;
host->data = new int[3];
host->data[0] = 1000;
host->data[1] = 2000;
host->data[2] = 3000;
// HOST -> DEVICE
Test* device;
COMPUTE_SAFE(cudaMalloc((void**)&device, sizeof(Test)));
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMalloc(&(device->data), 3 * sizeof(int))); // Error after calling this line
COMPUTE_SAFE(cudaMemcpy(device->data, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
TestKernel << < 1, 1 >> > (device);
COMPUTE_SAFE(cudaDeviceSynchronize());
// DEVICE -> HOST
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaMemcpy(host->data, device->data, 3 * sizeof(int), cudaMemcpyDeviceToHost));
printf("%d %d\n", host->x, host->y);
printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);
}
Note that my code crashes on the COMPUTE_SAFE(cudaMalloc(&(device->data), 3 * sizeof(int)));
line (Exception thrown at 0x00007FF857B13A50 (nvcuda64.dll) in VFD.exe: 0xC0000005: Access violation writing location 0x0000000B01600008
) and no CUDA error is thrown (COMPUTE_SAFE
is my error checking macro)
I’m adding yet another test case, this one gets a bit closer to the desired result.
Test* host = new Test{ 10, 20,new int[3]{1, 2, 3} };
Test* device = nullptr;
int* deviceData;
COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(float*), cudaMemcpyHostToDevice));
TestKernel <<< 1, 1 >>> (device);
COMPUTE_SAFE(cudaDeviceSynchronize());
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(float), cudaMemcpyDeviceToHost));
printf("\nhost:\n");
printf("%d %d\n", host->x, host->y); // works
// printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);
Note that running the above code also results in the same crash at the last line has been commented out.