Hello everybody,
I am pretty new to CUDA and would appreciate if you could have a quick look at the following code to tell me whether it is possible what I am trying to do:
In the hope to make the compiler unroll several for-loops in the original program, I defined structs with static data members for use as template arguments in the CUDA kernel, similar to the following (almost…) minimal example.
[codebox]
#include “stdio.h”
namespace Hello
{
struct CUDAStruct
{
static const double a = 25.;
static const double b[3];
};
}
const double Hello::CUDAStruct::b[3] = {1., 2., 3.};
template
global void gpuKernel( double* in, double* out)
{
uint idx=blockIdx.x*blockDim.x+threadIdx.x;
out[idx] = in[idx] + S::a;
in[idx] = S::b[1];
}
/*
global void gpuKernel( double* in, double* out)
{
uint idx=blockIdx.x*blockDim.x+threadIdx.x;
out[idx] = in[idx] + 52.;
in[idx] = -2.;
}
*/
int main()
{
double *in, *out;
double *d_in, *d_out;
uint N=16;
uint i;
in = (double*) malloc(N*sizeof(double));
out = (double*) malloc(N*sizeof(double));
cudaMalloc((void **) &d_in, sizeof(double)*N);
cudaMalloc((void **) &d_out, sizeof(double)*N);
for(i=0; i<N; ++i) {in[i] = (double) i;}
cudaMemcpy(d_in, in, sizeof(double)*N, cudaMemcpyHostToDevice);
dim3 dimBlock(16,1);
dim3 dimGrid(N/dimBlock.x,1);
gpuKernel<Hello::CUDAStruct><<<dimGrid,dimBlock>>>(d_in, d_out);
// gpuKernel<<<dimGrid,dimBlock>>>(d_in, d_out);
cudaMemcpy(in, d_in, sizeof(double)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(out, d_out, sizeof(double)*N, cudaMemcpyDeviceToHost);
for(i=0; i<N; ++i) {printf(" in[%2d]=%9f, out[%2d]=%9f\n", i,in[i],i,out[i]);}
free(in); free(out);
cudaFree(d_in); cudaFree(d_out);
return 0;
}[/codebox]
Trying to compile this (with release 3.0, V0.2.1221 under Ubuntu 10.04 x64) yields
CUDA > nvcc -arch=sm_13 -DSM13 template_test.cu -o template_test
template_test.cu(20): error: identifier "_ZN5Hello10CUDAStruct1bE" is undefined
1 error detected in the compilation of "/tmp/tmpxft_00006a7c_00000000-5_template_test.cpp2.i".
or in emulation mode (I do not understand why it says it was not even declared here)
CUDA > nvcc -arch=sm_13 -DSM13 -deviceemu template_test.cu -o template_test
In file included from /tmp/tmpxft_00006a54_00000000-1_template_test.cudafe1.stub.c:2,
from /tmp/tmpxft_00006a54_00000000-1_template_test.cudafe1.stub.c:2:
template_test.cu: In function ‘void __cuda_emu::_Z9gpuKernelIN5Hello10CUDAStructEEvPdS2_(double*, double*)’:
template_test.cu:20: error: ‘_ZN5Hello10CUDAStruct1bE’ was not declared in this scope
With S::b commented out (or using the non-templated kernel that is currently commented out), everything works fine.
Is it possible to use relatively complex template arguments like the array as kernel template arguments?
Thanks for reading this, comments are welcome!