Hi,
I have been working on a class that manages an array of data. The class has an overloaded () operator that carries out the array indexing. Since the indexing depends on the array dimensions (NX & NY), and these dimensions are constant, it seemed beneficial to store them in device constant memory instead of global memory. The code below is intended to do that. However, when I profile the code, the dimension values appear to be loaded from global memory rather than from constant memory.
For example, in the code below, the class “Foo” creates, frees, copies, and accesses the array members (arr, d_arr) on the host and device sides. The array dimensions are copied to device constant memory and stored in nx & ny. The array index is evaluated correctly and retrieves the right data. However, when I profiled the code with Nsight Compute (report attached), the overloaded () operator performs 3 loads from global memory, instead of 2 loads from constant memory (nx & ny) and 1 load from global memory (d_arr).
Any suggestions on how to make the overloaded operator() load the nx & ny values from constant memory instead of global memory?
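For reference, below is a minimal standalone sketch (my own simplified test case, not part of the project; the kernel and variable names are just placeholders) that reads the same __constant__ nx and ny directly in a free kernel. This is the pattern I would expect to compile to constant-bank accesses:

#include <cstdio>
#include <cuda_runtime.h>

__constant__ int nx, ny;

__global__ void index_kernel(const float* d_arr) {
    /* Same indexing expression as operator(), but without the class wrapper */
    printf("value = %f\n", d_arr[1 + nx * (1 + ny * 1)]);
}

int main() {
    int NX = 10, NY = 10, NZ = 10;
    float* d_arr;
    cudaMalloc(&d_arr, NX * NY * NZ * sizeof(float)); /* Allocate device array */
    cudaMemset(d_arr, 0, NX * NY * NZ * sizeof(float)); /* Dummy data */
    cudaMemcpyToSymbol(nx, &NX, sizeof(int)); /* Copy dimensions to constant memory */
    cudaMemcpyToSymbol(ny, &NY, sizeof(int));
    index_kernel<<<1, 1>>>(d_arr);
    cudaDeviceSynchronize();
    cudaFree(d_arr);
    return 0;
}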
Thanks
Mahmoud
class_constMem_1.ncu-rep (207.5 KB)
#include <iostream>
#include <algorithm>
#ifdef _WIN64
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#endif
using namespace std;
__constant__ int nx, ny;
struct Foo {
    Foo(int NX, int NY, int NZ) {
        arr = new float[NX * NY * NZ]; /* Allocate memory on host side */
        for (int i = 0; i != NX * NY * NZ; ++i) { /* Fill the memory on the host side with data */
            arr[i] = float(i);
        }
        cudaMalloc(&d_arr, NX * NY * NZ * sizeof(float)); /* Allocate memory on the device side */
        cudaMemcpy(d_arr, arr, NX * NY * NZ * sizeof(float), cudaMemcpyHostToDevice); /* Copy data from host side to device side */
    }
    ~Foo() {
        delete[] arr; /* Free memory on the host side */
        cudaFree(d_arr); /* Free memory on the device side */
    }
    __device__ inline float& operator()(int x, int y, int z) { return d_arr[x + nx * (y + ny * z)]; } /* Overloaded operator for loading data */
    float* arr;
    float* d_arr;
};
__global__ void cuda_kernel(Foo* obj) {
    printf("loaded value from device side = %f\n", (*obj)(1, 1, 1));
}
int main() {
    int NX = 10, NY = 10, NZ = 10; /* Array dimensions */
    Foo h_obj(NX, NY, NZ); /* Create a class object on the host side */
    cout << "Loaded value from host side = " << h_obj.arr[1 + 10 * (1 + 10 * 1)] << endl; /* Test: load a value from the array on the host side */
    Foo* d_obj; /* Device pointer to a class object */
    cudaMalloc(&d_obj, sizeof(Foo)); /* Allocate memory for the object on the device */
    cudaMemcpy(d_obj, &h_obj, sizeof(Foo), cudaMemcpyHostToDevice); /* Copy the object (including its device pointer) from host to device */
    cudaMemcpyToSymbol(nx, &NX, sizeof(int)); /* Copy the dimensions to device constant memory */
    cudaMemcpyToSymbol(ny, &NY, sizeof(int));
    cuda_kernel<<<1, 1>>>(d_obj); /* Test kernel */
    cudaDeviceSynchronize();
    cudaPeekAtLastError();
    cudaFree(d_obj); /* Free the device copy of the object */
    cout << "Program end!" << endl;
    return 0;
}
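For contrast, here is a sketch of the member-variable version I am trying to avoid, where the dimensions live inside the object (i.e. in global memory, alongside d_arr). The struct and kernel names are just placeholders, and I have not profiled this exact variant; my assumption is that it would show the extra global loads for the dimensions:

#include <cstdio>
#include <cuda_runtime.h>

struct FooMembers {
    int nx, ny;   /* Dimensions stored as data members, i.e. in global memory with the object */
    float* d_arr; /* Device data pointer */
    __device__ float& operator()(int x, int y, int z) { return d_arr[x + nx * (y + ny * z)]; } /* nx & ny are read from the object here */
};

__global__ void member_kernel(FooMembers* obj) {
    printf("value = %f\n", (*obj)(1, 1, 1));
}

int main() {
    int NX = 10, NY = 10, NZ = 10;
    FooMembers h_obj{NX, NY, nullptr};
    cudaMalloc(&h_obj.d_arr, NX * NY * NZ * sizeof(float)); /* Allocate device array */
    cudaMemset(h_obj.d_arr, 0, NX * NY * NZ * sizeof(float)); /* Dummy data */
    FooMembers* d_obj;
    cudaMalloc(&d_obj, sizeof(FooMembers));
    cudaMemcpy(d_obj, &h_obj, sizeof(FooMembers), cudaMemcpyHostToDevice);
    member_kernel<<<1, 1>>>(d_obj);
    cudaDeviceSynchronize();
    cudaFree(h_obj.d_arr);
    cudaFree(d_obj);
    return 0;
}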