Hi all, question as the topic. Here I illustrate my question with the following example.
I would like to rewrite the function test_fun in the following code as a kernel.
test_fun take an input double array ‘a’ specified in prototype, multiply it element-wise
with another double array ‘c’ which was evaluated/initialized at runtime before calling test_fun,
and ‘c’ is not supposed to be changed in test_fun.
The result is stored in ‘a’. In this example, a[i]=1, c[i]=2, after calling test_fun, a[i]=2 for all i.
(compiled with g++)
#include <stdlib.h>
#include <string.h>
double *c_d;
void test_fun(double* a_tmp, int mx) {
for (int i=0; i<mx; ++i) {
a_tmp[i]=a_tmp[i]*c_d[i];
}
}
int main () {
int blocksize=8;
int nx=2;
double* a;
double* a_d;
double* c;
a=(double*) malloc(nx*sizeof(double) );
c=(double*) malloc(nx*sizeof(double) );
a_d=(double*) malloc(nx*sizeof(double) );
c_d=(double*) malloc(nx*sizeof(double) );
//Initialize
for (int i=0; i<nx; ++i){
a[i]=1;
c[i]=2;
}
memcpy(a_d, a, nx*sizeof(double));
memcpy(c_d, c, nx*sizeof(double));
for (int i=0; i<nx; ++i) {
printf("Before func, a[%d]=%lf\n", i, a[i]);
}
//call func
test_fun(a_d, nx);
memcpy(a, a_d, nx*sizeof(double));
for (int i=0; i<nx; ++i) {
printf("After func, a[%d]=%lf\n", i, a[i]);
}
free(a_d);
free(c_d);
free(a);
free(c);
return 0;
}
The following is my attempt to write the kernel. First of all, I found that I could not simply keep double *c_d;
just after the include header, a compilation error said the device code cannot get access to c_d.
I am able to get it compiled by adding the global attribute. However, the result is wrong. After calling the kernel, a[i] remains 1 for all i.
(compiled with nvc++)
#include <stdlib.h>
__global__ double *c_d;
__global__ void test_ker(double* a_dev, int mx) {
int i=blockIdx.x*blockDim.x+threadIdx.x;
if (i<mx) {
a_dev[i]=a_dev[i]*c_d[i];
}
}
int main () {
int blocksize=8;
int nx=2;
double* a;
double* a_d;
double* c;
int dimx=nx%blocksize==0? nx/blocksize:nx/blocksize+1;
a=(double*) malloc(nx*sizeof(double) );
c=(double*) malloc(nx*sizeof(double) );
cudaMalloc(&a_d, nx*sizeof(double));
cudaMalloc(&c_d, nx*sizeof(double));
//Initialize
for (int i=0; i<nx; ++i){
a[i]=1;
c[i]=2;
}
//define thread grid
dim3 gridDim(dimx, 1, 1);
dim3 blockDim(blocksize, 1,1);
cudaMemcpy(a_d, a, nx*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(c_d, c, nx*sizeof(double), cudaMemcpyHostToDevice);
for (int i=0; i<nx; ++i) {
printf("Before ker, a[%d]=%lf\n", i, a[i]);
}
//call kernel
test_ker<<<gridDim,blockDim>>>(a_d, nx);
cudaDeviceSynchronize();
cudaMemcpy(a, a_d, nx*sizeof(double), cudaMemcpyDeviceToHost);
for (int i=0; i<nx; ++i) {
printf("After ker, a[%d]=%lf\n", i, a[i]);
}
cudaFree(a_d);
cudaFree(c_d);
free(a);
free(c);
return 0;
}
Anything I am missing? Or it is a must to specify array ‘c’ in the prototype of the kernel (I tried and it works but I would like to have the prototype to look like the original test_fun)?
Thanks for reading my question.