Yes. This is a viable option for you, I suppose. Here is a full example so as not to leave out any details. Be careful doing this, it’s easy to shoot yourself in the foot. As a rule of thumb, this only works when the struct is trivially copyable: raw pointers to plain data types (float, int, etc.) plus scalar members, with no destructor or deep-copy semantics. The launch copies the struct bit-for-bit to the device, so anything that owns resources through those pointers will not behave the way you expect — just be careful!
If you called the file pod.cu, you would compile with nvcc -std=c++11 -o pod_test pod.cu and run ./pod_test. Note that c++11 really isn’t necessary, it’s just for nullptr and constexpr. Just make sure you are ALWAYS initializing your pointers to nullptr or NULL to save yourself future headache!
// Plain-old-data (trivially copyable) descriptor of a device buffer.
// Because it holds only a raw pointer and a size, it is safe to pass to a
// kernel BY VALUE: the launch copies exactly these two members.
struct Geometry {
float *data = nullptr; // device pointer (allocated with cudaMalloc below)
size_t size = 0;       // number of floats in `data`
};
// Note: these are *NOT* references
// Note: these parameters are *NOT* references -- each Geometry is copied by
// value to the device, which is exactly why this works for a POD struct.
//
// Sets every element of geometry1.data to 1.0f and every element of
// geometry2.data to 2.0f.  Expects a 1-D launch with at least
// max(geometry1.size, geometry2.size) total threads; each thread writes at
// most one element of each buffer, guarded by the size checks.
__global__ void munge(Geometry geometry1, Geometry geometry2) {
// Use size_t for the flat index: the original `int` form both risks
// overflow in `blockIdx.x * blockDim.x` on very large grids and compares
// signed against the unsigned size_t `size` members.
size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
// set all g1 to 1
if (idx < geometry1.size)
geometry1.data[idx] = 1.0f;
// set all g2 to 2
if (idx < geometry2.size)
geometry2.data[idx] = 2.0f;
}
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Abort with a readable message if a CUDA runtime call fails.  Kernel
// launches do not return an error directly, so cudaGetLastError() is checked
// explicitly after the launch below.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,        \
                         __LINE__, cudaGetErrorString(err_));               \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

int main(void) {
    static constexpr size_t N = 20;

    // Create / allocate g1, zero it so we can verify the kernel overwrites it.
    Geometry g1;
    CUDA_CHECK(cudaMalloc(&g1.data, N * sizeof(float)));
    CUDA_CHECK(cudaMemset(g1.data, 0, N * sizeof(float)));
    g1.size = N;

    // Same for g2.
    Geometry g2;
    CUDA_CHECK(cudaMalloc(&g2.data, N * sizeof(float)));
    CUDA_CHECK(cudaMemset(g2.data, 0, N * sizeof(float)));
    g2.size = N;

    // Host staging buffers used to verify device contents.
    float *h_g1_data = (float *)malloc(N * sizeof(float));
    float *h_g2_data = (float *)malloc(N * sizeof(float));
    if (h_g1_data == nullptr || h_g2_data == nullptr) {
        std::fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Copy down and make sure both buffers really are zero.  A plain
    // cudaMemcpy is synchronous with respect to the host, so no extra
    // cudaDeviceSynchronize is needed here.
    CUDA_CHECK(cudaMemcpy(h_g1_data, g1.data, N * sizeof(float),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_g2_data, g2.data, N * sizeof(float),
                          cudaMemcpyDeviceToHost));

    // Tally the number of zeros we found for both.
    size_t n_g1_zero = 0;
    size_t n_g2_zero = 0;
    for (size_t i = 0; i < N; ++i) {
        if (h_g1_data[i] == 0.0f) ++n_g1_zero;
        if (h_g2_data[i] == 0.0f) ++n_g2_zero;
    }
    std::cout << "Number of zeros:" << std::endl
              << " - G1: " << n_g1_zero << std::endl
              << " - G2: " << n_g2_zero << std::endl << std::endl;

    // Now we just call our kernel.  g1 and g2 are POD, so passing them by
    // value copies only the pointer location and the size.
    // Uncomment this line to verify that size is getting copied correctly;
    // with it enabled only half of g2 gets written:
    // g2.size = N / 2;
    constexpr unsigned threads = 128;  // multiple of the warp size (32)
    const unsigned blocks =
        (unsigned)((N + threads - 1) / threads);  // ceil-div to cover N
    munge<<<blocks, threads>>>(g1, g2);
    CUDA_CHECK(cudaGetLastError());       // catch bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // catch in-kernel faults

    // Copy back and verify.
    CUDA_CHECK(cudaMemcpy(h_g1_data, g1.data, N * sizeof(float),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_g2_data, g2.data, N * sizeof(float),
                          cudaMemcpyDeviceToHost));

    size_t n_g1_one = 0;
    size_t n_g2_two = 0;
    for (size_t i = 0; i < N; ++i) {
        if (h_g1_data[i] == 1.0f) ++n_g1_one;
        if (h_g2_data[i] == 2.0f) ++n_g2_two;
    }
    std::cout << "Number of G1 were 1: " << n_g1_one << std::endl
              << "Number of G2 were 2: " << n_g2_two << std::endl;

    // Free up the data we used.
    CUDA_CHECK(cudaFree(g1.data));
    CUDA_CHECK(cudaFree(g2.data));
    free(h_g1_data);
    free(h_g2_data);
    return 0;
}
So it works fine, and may be useful, but for this problem I personally would just change the signature of munge to be
__global__ void munge(float * __restrict__ g1_data, size_t g1_size,
float * __restrict__ g2_data, size_t g2_size) { /* ... */ }
if you’re really that worried about aliasing here. One caveat: `restrict` is a C99 keyword and is not part of standard C++ at all — what you write in C++ is the compiler extension `__restrict__` (or `__restrict` on MSVC), and its exact guarantees are whatever your compiler documents, not the C standard’s strict-aliasing contract.
Edit: my memory seems correct — it’s not officially standardized, but most compilers support it as an extension. See “What does the restrict keyword mean in C++?” on Stack Overflow.