Copy entire struct to, and from the GPU

Hello, I have a simple struct that may look something like this:

template<class T>
struct Arr {
    T* data;
    unsigned int count;

    __host__ void MoveToDevice() {
        T* device; // Temporary memory buffer

	    COMPUTE_SAFE(cudaMalloc(&(device), count * sizeof(T)));
		COMPUTE_SAFE(cudaMemcpy(device, data, count * sizeof(T), cudaMemcpyHostToDevice));

		delete[] data; // Free host-side memory since we've copied it to the GPU
		data = device;
    }

    __host__ __device__ void MoveToHost() {
		T* host = new T[count]; // Temporary memory buffer

		COMPUTE_SAFE(cudaMemcpy(host, data, count * sizeof(T), cudaMemcpyDeviceToHost));
		COMPUTE_SAFE(cudaFree(data)); // Free device-side memory since we've copied it to the CPU

		data = host;
	}
}

The struct contains a function for moving the instance to, and from the GPU. the data pointer gets copied properly, but the count property does not - how can I copy the count property to and from the GPU on the same instance?

Thanks.

I’m adding another test case, showcasing what I’ve tried so far:

struct Test {
	int x, y;
	int* data;
};

static __global__  void TestKernel(Test* d) {
	d->x = 100;
	d->y = 2;

	printf("%d %d %d\n", d->data[0], d->data[1], d->data[2]);
}

void TestCUDA()
{
	Test* host = (Test*)malloc(sizeof(Test));
	host->x = 1;
	host->y = 2;
	host->data = new int[3];
	host->data[0] = 1000;
	host->data[1] = 2000;
	host->data[2] = 3000;

	// HOST -> DEVICE
	Test* device;
	COMPUTE_SAFE(cudaMalloc((void**)&device, sizeof(Test)));
	COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));

	COMPUTE_SAFE(cudaMalloc(&(device->data), 3 * sizeof(int))); // Error after calling this line
	COMPUTE_SAFE(cudaMemcpy(device->data, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));

	TestKernel << < 1, 1 >> > (device);
	COMPUTE_SAFE(cudaDeviceSynchronize());

	// DEVICE -> HOST
	COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
	COMPUTE_SAFE(cudaMemcpy(host->data, device->data, 3 * sizeof(int), cudaMemcpyDeviceToHost));

	printf("%d %d\n", host->x, host->y);
    printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);
}

Note that my code crashes on the COMPUTE_SAFE(cudaMalloc(&(device->data), 3 * sizeof(int))); line (Exception thrown at 0x00007FF857B13A50 (nvcuda64.dll) in VFD.exe: 0xC0000005: Access violation writing location 0x0000000B01600008) and no CUDA error is thrown (COMPUTE_SAFE is my error checking macro)

I’m adding yet another test case, this one gets a bit closer to the desired result.

Test* host = new Test{ 10, 20,new int[3]{1, 2, 3} };
Test* device = nullptr;
int* deviceData;

COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));

COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(float*), cudaMemcpyHostToDevice));

TestKernel <<< 1, 1 >>> (device);
COMPUTE_SAFE(cudaDeviceSynchronize());

COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(float), cudaMemcpyDeviceToHost));


printf("\nhost:\n");
printf("%d %d\n", host->x, host->y); // works
// printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);

Note that running the above code also results in the same crash at the last line has been commented out.