Is it possible for cuda C/C++ kernel to get access to an array (allocated by cudaMalloc) without specifying at the kernel prototype?

Hi all, question as the topic. Here I illustrate my question with the following example.
I would like to rewrite the function test_fun in the following code as a kernel.
test_fun takes an input double array ‘a’ specified in the prototype and multiplies it element-wise
with another double array ‘c’, which was evaluated/initialized at runtime before calling test_fun;
‘c’ is not supposed to be changed in test_fun.
The result is stored in ‘a’. In this example, a[i]=1 and c[i]=2, so after calling test_fun, a[i]=2 for all i.
(compiled with g++)

#include <stdlib.h>
#include <string.h>
double *c_d;

// Element-wise multiply: scales each entry of a_tmp by the matching entry
// of the file-scope array c_d, writing the product back into a_tmp.
// mx is the number of elements to process.
void test_fun(double* a_tmp, int mx) {
    for (int idx = 0; idx < mx; ++idx) {
        a_tmp[idx] *= c_d[idx];
    }
}

// Host-only demo: initialize a[i]=1 and c[i]=2 everywhere, run test_fun,
// and print a[] before and after (expected result: every a[i] becomes 2).
// The *_d buffers mimic the device copies the CUDA version will use, but
// here they are plain heap allocations.
int main() {
    int blocksize = 8;  // unused in the CPU version; kept for symmetry with the CUDA code
    int nx = 2;
    size_t bytes = nx * sizeof(double);

    double* a = (double*)malloc(bytes);
    double* c = (double*)malloc(bytes);
    double* a_d = (double*)malloc(bytes);
    c_d = (double*)malloc(bytes);

    // Initialize
    for (int i = 0; i < nx; ++i) {
        a[i] = 1;
        c[i] = 2;
    }

    memcpy(a_d, a, bytes);
    memcpy(c_d, c, bytes);

    for (int i = 0; i < nx; ++i) {
        printf("Before func, a[%d]=%lf\n", i, a[i]);
    }

    // call func
    test_fun(a_d, nx);

    memcpy(a, a_d, bytes);

    for (int i = 0; i < nx; ++i) {
        printf("After func, a[%d]=%lf\n", i, a[i]);
    }

    free(a_d);
    free(c_d);
    free(a);
    free(c);
    return 0;
}

The following is my attempt to write the kernel. First of all, I found that I could not simply keep `double *c_d;` just after the include header; a compilation error said the device code cannot get access to c_d.
I was able to get it compiled by adding the `__global__` attribute. However, the result is wrong: after calling the kernel, a[i] remains 1 for all i.
(compiled with nvc++)

#include <stdlib.h>
__global__ double *c_d;

// Attempted kernel from the question: element-wise multiply of a_dev by
// the file-scope pointer c_d.  NOTE(review): c_d is declared above with
// __global__, which is a kernel-function qualifier, not a variable
// qualifier -- the accepted answer below replaces it with __device__.
__global__ void test_ker(double* a_dev, int mx) {
// Flat 1-D global thread index: one thread handles one element.
int i=blockIdx.x*blockDim.x+threadIdx.x;
// Bounds guard: the grid is rounded up, so it may overshoot mx.
if (i<mx)  {
  a_dev[i]=a_dev[i]*c_d[i];
}

}


// Host driver from the question.  The flow (alloc, copy in, launch, copy
// out, print) is standard; the failure comes from how c_d is populated.
int main ()  {
int blocksize=8;
int nx=2;
double* a;
double* a_d;
double* c;


// Round up so every element gets a thread: ceil(nx / blocksize) blocks.
int dimx=nx%blocksize==0? nx/blocksize:nx/blocksize+1;

a=(double*) malloc(nx*sizeof(double) );
c=(double*) malloc(nx*sizeof(double) );

cudaMalloc(&a_d, nx*sizeof(double));
// NOTE(review): this is the bug discussed in the thread.  c_d was
// declared with __global__ (a function qualifier) rather than
// __device__, and host code cannot take the address of a device symbol
// like an ordinary variable -- so this cudaMalloc never stores the
// allocated device address into the symbol the kernel reads.  The
// accepted answer allocates into a host-side temporary and publishes it
// with cudaMemcpyToSymbol instead.
cudaMalloc(&c_d, nx*sizeof(double));

//Initialize
for (int i=0; i<nx; ++i){
    a[i]=1;
    c[i]=2;
}


//define thread grid
dim3 gridDim(dimx, 1, 1);
dim3 blockDim(blocksize, 1,1);

cudaMemcpy(a_d, a, nx*sizeof(double), cudaMemcpyHostToDevice);
// NOTE(review): for the same reason, this copy does not reach the buffer
// the kernel dereferences through c_d, which is why a[] is unchanged.
cudaMemcpy(c_d, c, nx*sizeof(double), cudaMemcpyHostToDevice);

for (int i=0; i<nx; ++i) {
 printf("Before ker, a[%d]=%lf\n", i, a[i]);
}

//call kernel
test_ker<<<gridDim,blockDim>>>(a_d, nx);

// Block until the kernel finishes before reading results back.
cudaDeviceSynchronize();

cudaMemcpy(a, a_d, nx*sizeof(double), cudaMemcpyDeviceToHost);

for (int i=0; i<nx; ++i) {
 printf("After ker, a[%d]=%lf\n", i, a[i]);
}
cudaFree(a_d);
cudaFree(c_d);
free(a);
free(c);
return 0;
}

Anything I am missing? Or is it a must to specify array ‘c’ in the prototype of the kernel (I tried that and it works, but I would like the prototype to look like the original test_fun)?

Thanks for reading my question.

Yes, it's possible. Basically I think you are asking for a device-allocated global-scope variable. In CUDA the usual decorator for that is __device__. Global-scope variables like this must have a size determined at compile time. Therefore, as you have indicated, you can work around this by having a global-scope pointer variable and then allocating its target dynamically. The following changes should get you most of the way there. I haven’t tested it, just written the changes in the browser.

#include <stdio.h>
#include <stdlib.h>
__device__ double *c_d;

// Kernel: a_dev[i] *= c_d[i] for every i in [0, mx).  c_d is the
// file-scope __device__ pointer that the host fills in with
// cudaMemcpyToSymbol before launch.  Launch with any 1-D grid providing
// at least mx threads; surplus threads exit at the guard.
__global__ void test_ker(double* a_dev, int mx) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= mx) return;  // tail guard: grid is rounded up past mx
    a_dev[tid] *= c_d[tid];
}


// Host driver for the __device__-pointer version: allocates device
// buffers, publishes the c buffer's address into the __device__ symbol
// c_d via cudaMemcpyToSymbol, launches test_ker, and prints a[] before
// and after (expected: every a[i] goes from 1 to 2).
int main() {
    int blocksize = 8;
    int nx = 2;
    double* a;
    double* a_d;
    double* c;
    double* c_d_temp;  // host-visible copy of the device address stored in c_d

    // ceil(nx / blocksize) blocks so every element gets a thread.
    int dimx = nx % blocksize == 0 ? nx / blocksize : nx / blocksize + 1;

    a = (double*) malloc(nx * sizeof(double));
    c = (double*) malloc(nx * sizeof(double));

    cudaMalloc(&a_d, nx * sizeof(double));
    cudaMalloc(&c_d_temp, nx * sizeof(double));
    // Publish the freshly allocated device address into the __device__
    // symbol c_d so the kernel can reach the buffer without a parameter.
    cudaMemcpyToSymbol(c_d, &c_d_temp, sizeof(double *));

    // Initialize
    for (int i = 0; i < nx; ++i) {
        a[i] = 1;
        c[i] = 2;
    }

    // define thread grid
    dim3 gridDim(dimx, 1, 1);
    dim3 blockDim(blocksize, 1, 1);

    cudaMemcpy(a_d, a, nx * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(c_d_temp, c, nx * sizeof(double), cudaMemcpyHostToDevice);

    for (int i = 0; i < nx; ++i) {
        printf("Before ker, a[%d]=%lf\n", i, a[i]);
    }

    // call kernel
    test_ker<<<gridDim, blockDim>>>(a_d, nx);

    // Launch-configuration errors do not surface from the <<<>>> call
    // itself; check explicitly.  Execution errors surface at the sync.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    }

    cudaDeviceSynchronize();

    cudaMemcpy(a, a_d, nx * sizeof(double), cudaMemcpyDeviceToHost);

    for (int i = 0; i < nx; ++i) {
        printf("After ker, a[%d]=%lf\n", i, a[i]);
    }

    cudaFree(a_d);
    // BUG FIX: the original called cudaFree(c_d), passing the __device__
    // symbol itself -- host code cannot read a __device__ variable, so
    // that call cannot receive the allocation's address.  Free through
    // the host-visible pointer instead.
    cudaFree(c_d_temp);
    free(a);
    free(c);
    return 0;
}
1 Like

Thanks a lot Robert.
I tested it and it works.
I don’t know this trick and once again thank you.