OK, so the following works, but it is ugly. I am wondering whether one can define a macro for templates (sorry, I am not a C++ person).
The binary is also three times larger than the program I posted in post #12, and it takes longer to compile.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/*
 * demo<radius>: fills one row of `out` per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks. The thread's global x index
 * (blockIdx.x * blockDim.x + threadIdx.x) selects the row it handles, so the
 * launch must supply at least `height` threads in total; threads whose index
 * is at or beyond `height` do nothing.
 *
 * For its row r, a thread writes
 *   out[r * width + c] = c + r               for c <  radius
 *   out[r * width + c] = in[r * width + c]   for c >= radius
 * for every column c in [0, width).
 *
 * Template parameter:
 *   radius - compile-time length of the per-thread scratch array. It must be
 *            at least 1, because a zero-length array is not valid standard C++.
 *
 * Parameters:
 *   in     - input image, row-major, width * height ints
 *   out    - output image, row-major, width * height ints
 *   width  - number of columns per row
 *   height - number of rows
 */
template <unsigned int radius>
__global__ void demo(int * in, int * out, int width, int height) {
    // Nothing to write for an empty image. Rejecting non-positive extents here
    // also keeps the unsigned comparisons below from misreading a negative
    // value as a very large one.
    if (width <= 0 || height <= 0) return;

    const unsigned int rowIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // Valid rows are 0 .. height-1, so the row equal to `height` must be
    // rejected as well; testing with '>' let that thread touch memory one row
    // past the end of both buffers.
    if (rowIndex >= (unsigned int) height) return;

    // Compute the row offset in size_t so that rowIndex * width cannot wrap
    // around in 32-bit arithmetic for large images.
    const size_t rowStart = (size_t) rowIndex * (size_t) width;
    const int * srcRow = in + rowStart;
    int * dstRow = out + rowStart;

    // Per-thread scratch array whose length is fixed at compile time.
    int buffer[radius];
    for (unsigned int k = 0; k < radius; k++)
        buffer[k] = (int) (k + rowIndex);

    // The first min(radius, width) columns come from the scratch array, the
    // remaining columns are copied unchanged from the input row.
    const unsigned int cols = (unsigned int) width;
    const unsigned int head = radius < cols ? radius : cols;
    for (unsigned int col = 0; col < head; col++)
        dstRow[col] = buffer[col];
    for (unsigned int col = head; col < cols; col++)
        dstRow[col] = srcRow[col];
}
/* Print a diagnostic and terminate when a CUDA runtime call reports an error. */
static void demoCheck(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Launch demo<radius> for a radius that is only known at run time.
 *
 * A template argument must be a compile-time constant, so every supported
 * radius needs its own instantiation. The DEMO_CASE macro generates one switch
 * case per radius, which guarantees that the case label and the template
 * argument are always the same number.
 *
 * Returns 1 when a kernel was launched, 0 when `radius` is outside 1..49.
 */
static int launchDemo(int radius, dim3 gridsize, dim3 blocksize,
                      int * d_in, int * d_out, int width, int height) {
#define DEMO_CASE(R) \
    case R: demo<R><<<gridsize, blocksize>>>(d_in, d_out, width, height); return 1;
    switch (radius) {
    DEMO_CASE(1)  DEMO_CASE(2)  DEMO_CASE(3)  DEMO_CASE(4)  DEMO_CASE(5)
    DEMO_CASE(6)  DEMO_CASE(7)  DEMO_CASE(8)  DEMO_CASE(9)  DEMO_CASE(10)
    DEMO_CASE(11) DEMO_CASE(12) DEMO_CASE(13) DEMO_CASE(14) DEMO_CASE(15)
    DEMO_CASE(16) DEMO_CASE(17) DEMO_CASE(18) DEMO_CASE(19) DEMO_CASE(20)
    DEMO_CASE(21) DEMO_CASE(22) DEMO_CASE(23) DEMO_CASE(24) DEMO_CASE(25)
    DEMO_CASE(26) DEMO_CASE(27) DEMO_CASE(28) DEMO_CASE(29) DEMO_CASE(30)
    DEMO_CASE(31) DEMO_CASE(32) DEMO_CASE(33) DEMO_CASE(34) DEMO_CASE(35)
    DEMO_CASE(36) DEMO_CASE(37) DEMO_CASE(38) DEMO_CASE(39) DEMO_CASE(40)
    DEMO_CASE(41) DEMO_CASE(42) DEMO_CASE(43) DEMO_CASE(44) DEMO_CASE(45)
    DEMO_CASE(46) DEMO_CASE(47) DEMO_CASE(48) DEMO_CASE(49)
    default: return 0;
    }
#undef DEMO_CASE
}

/*
 * Host driver: zero-fills a width x height input image, runs demo<radius> on
 * it and prints the resulting image, one tab-separated row per line.
 *
 * Returns 0 on success (including radius 0, for which nothing is computed or
 * printed) and EXIT_FAILURE when an allocation fails or the radius is not
 * supported. A failing CUDA call terminates the program via demoCheck().
 */
int main(void) {
    int ii, jj;
    int width = 8, height = 8;
    int radius = 7;
    int dim = 8;

    /* Radius 0 means there is nothing to do; leave before acquiring resources. */
    if (radius == 0) return 0;

    const size_t count = (size_t) width * (size_t) height;
    const size_t bytes = count * sizeof(int);

    /* calloc hands back the input image already zero-filled. */
    int * in = (int *) calloc(count, sizeof(int));
    int * out = (int *) malloc(bytes);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long) bytes);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    int * d_in = NULL, * d_out = NULL;
    demoCheck(cudaMalloc((void **) &d_in, bytes), "cudaMalloc(d_in)");
    demoCheck(cudaMalloc((void **) &d_out, bytes), "cudaMalloc(d_out)");
    demoCheck(cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    /*
     * One thread per row. Round the block count up so that the last rows are
     * still covered when height is not a multiple of the block size; this
     * relies on the kernel ignoring thread indices at or beyond height.
     */
    dim3 blocksize(dim);
    dim3 gridsize((height + dim - 1) / dim);

    const int launched = launchDemo(radius, gridsize, blocksize, d_in, d_out, width, height);
    if (launched) {
        /* A kernel launch returns no status; configuration errors show up here. */
        demoCheck(cudaGetLastError(), "demo kernel launch");
        /* The blocking copy also surfaces errors raised while the kernel ran. */
        demoCheck(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

        for (ii = 0; ii < height; ii++) {
            for (jj = 0; jj < width; jj++) {
                printf("%d\t", out[ii * width + jj]);
            }
            printf("\n");
        }
    } else {
        /* Without a launch d_out holds no defined data, so print nothing from it. */
        fprintf(stderr, "unsupported radius %d (supported: 0..49)\n", radius);
    }

    demoCheck(cudaFree(d_in), "cudaFree(d_in)");
    demoCheck(cudaFree(d_out), "cudaFree(d_out)");
    free(in);
    free(out);
    return launched ? 0 : EXIT_FAILURE;
}