Defining Array in Kernel

Am I missing something? Why wouldn't the following work?

__global__ void my_kernel(int3 * in, int3 * out, int radius) {
	int dx = blockDim.x;
	int bx = blockIdx.x;
	int tx = threadIdx.x;

	int3 buffer[DIM + 2*radius];
}

Compiling gives the following error:

CUDA_NVCC_HOST_COMPILER_FLAGS = 
nvcc_host_compiler_flags = "-Xcompiler;,"-g""
Signal: Segmentation fault in Code_Expansion phase.
<input>(0): Error: Signal Segmentation fault in phase Code_Expansion -- processing aborted
*** Internal stack backtrace:
	/usr/local/cuda/open64/lib//be [0x82f1cde]
	/usr/local/cuda/open64/lib//be [0x82f26bc]
	/usr/local/cuda/open64/lib//be [0x82f2a48]
	/usr/local/cuda/open64/lib//be [0x82f1f2f]
	/usr/local/cuda/open64/lib//be [0x82f2cee]
	[0x4001e400]
	/usr/local/cuda/open64/lib//be [0x81a5d73]
	/usr/local/cuda/open64/lib//be [0x81a2a3c]
	/usr/local/cuda/open64/lib//be [0x81a3cfb]
	/usr/local/cuda/open64/lib//be [0x81a2d63]
	/usr/local/cuda/open64/lib//be [0x81a7555]
	/usr/local/cuda/open64/lib//be [0x81a817c]
	/usr/local/cuda/open64/lib//be [0x81a86e5]
	/usr/local/cuda/open64/lib//be [0x8184f55]
	/usr/local/cuda/open64/lib//be [0x804d0d5]
	/usr/local/cuda/open64/lib//be [0x804de6b]
	/usr/local/cuda/open64/lib//be [0x804f1d3]
	/lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xe5) [0x4017d775]
	/usr/local/cuda/open64/lib//be [0x804b655]
nvopencc INTERNAL ERROR: /usr/local/cuda/open64/lib//be died due to signal 4

Thank you

No such thing as arrays which are sized at runtime in C/C++, I am afraid.

I am sorry avidday, but what does that mean? Clearly you can do the following in C:

void my_c_kernel(int radius) {
	int buffer[2*radius];
}

Does CUDA add a restriction?

It is not supported at all in C++, and I don't know what the C standard says about variable length arrays (I know they work on most compilers, but I am not sure they are part of the standard, though).

Is there a way around that, i.e. some way to define a dynamic array inside a kernel?

Repeat after me: kernels cannot allocate memory. This is a recurring theme in CUDA!

However, the compiler should not have died like it did - your code may have been invalid, but nvcc should have produced a useful error message, not an internal compiler error. I’d submit it as a bug report.
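For completeness, the usual workaround is to do the allocation on the host, where radius is known, and hand the kernel a pointer into a pre-allocated global-memory scratch buffer. A minimal sketch, with illustrative names:

// Minimal sketch (illustrative names): per-thread scratch space is allocated
// from the host, where radius is known, and passed in as a pointer.
__global__ void my_kernel(int3 * in, int3 * out, int3 * scratch, int perThread) {
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	int3 * buffer = &scratch[tid * perThread];  // this thread's private slice
	// ... use buffer[0 .. perThread-1] in place of the local array ...
}

// Host side:
//   int perThread = DIM + 2 * radius;
//   cudaMalloc((void **) &d_scratch, numThreads * perThread * sizeof(int3));
//   my_kernel<<<gridsize, blocksize>>>(d_in, d_out, d_scratch, perThread);

The cost is that the scratch lives in (slow) global memory rather than in registers or local memory, but the size can be anything the host can allocate.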

A quick test with gcc suggests that dynamic local arrays were added in C99.

I am wondering if there is a way around that, even if it is an ugly hack. Would the following work?

__global__ void my_kernel(int3 * in, int3 * out, int radius) {
	int dx = blockDim.x;
	int bx = blockIdx.x;
	int tx = threadIdx.x;

	int3 *buffer;
	if (radius == 2) {
		int3 __buffer[DIM + 4];
		buffer = __buffer;
	} else if (radius == 3) {
		int3 __buffer[DIM + 6];
		buffer = __buffer;
	} etc...
}

I am wondering at what point the memory for __buffer is reclaimed.

You can’t do that in C90 and you can’t do it in C++ either.

Huh, apparently C99’s dynamic arrays may be stored on either the stack or the heap. Learn something new every day…

It compiles and runs with gcc -std=c89, and the following also compiles and runs:

#include <stdio.h>

int f() {
  int u;
  scanf("%d", &u);
  int k[u];
  k[u-1] = u;
  return k[u-1];
}

int main(void) {
  int k = f();
  printf("%d", k);
  return 0;
}

As for my little hack, the program works fine, but what I am doing should be undefined behavior, right? I have tried it with radius values up to 80.

I don't know why gcc does that, because it shouldn't: the C90 standard requires array sizes to be integral constant expressions.

The Sun Studio compiler doesn't accept it:

kuusi2:(~)(57)% cc junk.c -xc99=none -c -o junk.o
"junk.c", line 6: integral constant expression expected
"junk.c", line 6: warning: declaration can not follow a statement
cc: acomp failed for junk.c
kuusi2:(~)(58)% cat junk.c
#include <stdio.h>

int f() {
  int u;
  scanf("%d", &u);
  int k[u];
  k[u-1] = u;
  return k[u-1];
}

int main(void) {
  int k = f();
  printf("%d", k);
  return 0;
}

and neither does the Intel compiler:

[david@lattice ~]$ icc -std=c89 -c -o junk.o junk.c
junk.c(6): error: declaration may not appear after executable statement in block
	int k[u];
	^
compilation aborted for junk.c (code 2)

The C program works in LLVM using llvm-gcc, but that was not my original question. My question is: would something like the following always work?

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void demo(int * in, int * out, int width, int height, int radius) {
	unsigned int rowIndex = blockIdx.x*blockDim.x + threadIdx.x;
	int * row = &(in[rowIndex * width]);
	if (rowIndex >= height) return;

	int * buffer;
	if      (radius == 0)  { return;                               }
	else if (radius == 1)  { int __buffer[1];   buffer = __buffer; }
	else if (radius == 2)  { int __buffer[2];   buffer = __buffer; }
	else if (radius == 3)  { int __buffer[3];   buffer = __buffer; }
	else if (radius == 4)  { int __buffer[4];   buffer = __buffer; }
	else if (radius == 5)  { int __buffer[5];   buffer = __buffer; }
	else if (radius == 6)  { int __buffer[6];   buffer = __buffer; }
	else if (radius == 7)  { int __buffer[7];   buffer = __buffer; }
	else if (radius == 8)  { int __buffer[8];   buffer = __buffer; }
	else if (radius == 9)  { int __buffer[9];   buffer = __buffer; }
	else if (radius == 10) { int __buffer[10];  buffer = __buffer; }
	else if (radius == 11) { int __buffer[11];  buffer = __buffer; }
	else if (radius == 12) { int __buffer[12];  buffer = __buffer; }
	else if (radius == 13) { int __buffer[13];  buffer = __buffer; }
	else if (radius == 14) { int __buffer[14];  buffer = __buffer; }
	else if (radius == 15) { int __buffer[15];  buffer = __buffer; }
	else if (radius == 16) { int __buffer[16];  buffer = __buffer; }
	else if (radius == 17) { int __buffer[17];  buffer = __buffer; }
	else if (radius == 18) { int __buffer[18];  buffer = __buffer; }
	else if (radius == 19) { int __buffer[19];  buffer = __buffer; }
	else if (radius == 20) { int __buffer[20];  buffer = __buffer; }
	else if (radius == 21) { int __buffer[21];  buffer = __buffer; }
	else if (radius == 22) { int __buffer[22];  buffer = __buffer; }
	else if (radius == 23) { int __buffer[23];  buffer = __buffer; }
	else if (radius == 24) { int __buffer[24];  buffer = __buffer; }
	else if (radius == 25) { int __buffer[25];  buffer = __buffer; }
	else if (radius == 26) { int __buffer[26];  buffer = __buffer; }
	else if (radius == 27) { int __buffer[27];  buffer = __buffer; }
	else if (radius == 28) { int __buffer[28];  buffer = __buffer; }
	else if (radius == 29) { int __buffer[29];  buffer = __buffer; }
	else if (radius == 30) { int __buffer[30];  buffer = __buffer; }
	else if (radius == 31) { int __buffer[31];  buffer = __buffer; }
	else if (radius == 32) { int __buffer[32];  buffer = __buffer; }
	else if (radius == 33) { int __buffer[33];  buffer = __buffer; }
	else if (radius == 34) { int __buffer[34];  buffer = __buffer; }
	else if (radius == 35) { int __buffer[35];  buffer = __buffer; }
	else if (radius == 36) { int __buffer[36];  buffer = __buffer; }
	else if (radius == 37) { int __buffer[37];  buffer = __buffer; }
	else if (radius == 38) { int __buffer[38];  buffer = __buffer; }
	else if (radius == 39) { int __buffer[39];  buffer = __buffer; }
	else if (radius == 40) { int __buffer[40];  buffer = __buffer; }
	else if (radius == 41) { int __buffer[41];  buffer = __buffer; }
	else if (radius == 42) { int __buffer[42];  buffer = __buffer; }
	else if (radius == 43) { int __buffer[43];  buffer = __buffer; }
	else if (radius == 44) { int __buffer[44];  buffer = __buffer; }
	else if (radius == 45) { int __buffer[45];  buffer = __buffer; }
	else if (radius == 46) { int __buffer[46];  buffer = __buffer; }
	else if (radius == 47) { int __buffer[47];  buffer = __buffer; }
	else if (radius == 48) { int __buffer[48];  buffer = __buffer; }
	else if (radius == 49) { int __buffer[49];  buffer = __buffer; }

	int ii;
	for (ii = 0; ii < radius; ii++)
		buffer[ii] = ii + rowIndex;
	for (ii = 0; ii < width; ii++) {
		if (ii < radius) {
			out[rowIndex * width + ii] = buffer[ii];
		} else {
			out[rowIndex * width + ii] = row[ii];
		}
	}
}

int main(void) {
	int ii, jj;
	int width = 8, height = 8;
	int radius = 7;
	int dim = 8;
	int * in  = (int *) malloc(width * height * sizeof(int));
	int * out = (int *) malloc(width * height * sizeof(int));
	int * d_in, * d_out;

	cudaMalloc((void **) &d_in,  width * height * sizeof(int));
	cudaMalloc((void **) &d_out, width * height * sizeof(int));
	for (ii = 0; ii < width*height; ii++)
		in[ii] = 0;
	cudaMemcpy(d_in, in, width * height * sizeof(int), cudaMemcpyHostToDevice);
	dim3 blocksize(dim);
	dim3 gridsize(height/blocksize.x);
	demo<<<gridsize, blocksize>>>(d_in, d_out, width, height, radius);
	cudaMemcpy(out, d_out, width * height * sizeof(int), cudaMemcpyDeviceToHost);
	for (ii = 0; ii < height; ii++) {
		for (jj = 0; jj < width; jj++) {
			printf("%d\t", out[ii * width + jj]);
		}
		printf("\n");
	}

	return 0;
}

No, because the size of the array is not known at compile time; you're depending on incorrect behavior in pre-C99 compilers.

Are you referring to post #12?

Yes. You're still sizing the array at runtime, so it doesn't work.

This is tricky, because you could imagine a compiler which would statically allocate each one of those arrays, even though each lives in its own branch. I have no idea what nvcc will try to do.

A better way to implement what you want would be to make the array length depend upon a template parameter:

template<unsigned int radius>
__global__ void my_kernel(int3 * in, int3 * out) {
	int dx = blockDim.x;
	int bx = blockIdx.x;
	int tx = threadIdx.x;

	int3 buffer[DIM + 2*radius];
}

Then, you could “dynamically” choose the length of the array with a switch:

switch(radius) {
  case 0: return;
  case 1: my_kernel<1>...
  case 2: my_kernel<2>...
  etc...
}

I think it’s the combination of command line options. When I tried, I had “-ansi -pedantic” on my gcc command line too. Then, the dynamic local allocation was rejected by gcc.

OK, so the following works, but it is ugly. I am wondering if one can define a macro for the template instantiations (sorry, I am not a C++ person); a sketch of one possibility follows the listing below.

The binary is also three times larger than the program I posted in post #12, and it takes longer to compile.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

template <unsigned int radius>
__global__ void demo(int * in, int * out, int width, int height) {
	unsigned int rowIndex = blockIdx.x*blockDim.x + threadIdx.x;
	int * row = &(in[rowIndex * width]);
	if (rowIndex >= height) return;

	int buffer[radius];
	int ii;
	for (ii = 0; ii < radius; ii++)
		buffer[ii] = ii + rowIndex;
	for (ii = 0; ii < width; ii++) {
		if (ii < radius) {
			out[rowIndex * width + ii] = buffer[ii];
		} else {
			out[rowIndex * width + ii] = row[ii];
		}
	}
}

int main(void) {
	int ii, jj;
	int width = 8, height = 8;
	int radius = 7;
	int dim = 8;
	int * in  = (int *) malloc(width * height * sizeof(int));
	int * out = (int *) malloc(width * height * sizeof(int));
	int * d_in, * d_out;

	cudaMalloc((void **) &d_in,  width * height * sizeof(int));
	cudaMalloc((void **) &d_out, width * height * sizeof(int));
	for (ii = 0; ii < width*height; ii++)
		in[ii] = 0;
	cudaMemcpy(d_in, in, width * height * sizeof(int), cudaMemcpyHostToDevice);
	dim3 blocksize(dim);
	dim3 gridsize(height/blocksize.x);
	switch (radius) {
		case 0: return 0;
		case 1:   demo<1><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 2:   demo<2><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 3:   demo<3><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 4:   demo<4><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 5:   demo<5><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 6:   demo<6><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 7:   demo<7><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 8:   demo<8><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 9:   demo<9><<<gridsize, blocksize>>>(d_in, d_out, width, height);  break;
		case 10:  demo<10><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 11:  demo<11><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 12:  demo<12><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 13:  demo<13><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 14:  demo<14><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 15:  demo<15><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 16:  demo<16><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 17:  demo<17><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 18:  demo<18><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 19:  demo<19><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 20:  demo<20><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 21:  demo<21><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 22:  demo<22><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 23:  demo<23><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 24:  demo<24><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 25:  demo<25><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 26:  demo<26><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 27:  demo<27><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 28:  demo<28><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 29:  demo<29><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 30:  demo<30><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 31:  demo<31><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 32:  demo<32><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 33:  demo<33><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 34:  demo<34><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 35:  demo<35><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 36:  demo<36><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 37:  demo<37><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 38:  demo<38><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 39:  demo<39><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 40:  demo<40><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 41:  demo<41><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 42:  demo<42><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 43:  demo<43><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 44:  demo<44><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 45:  demo<45><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 46:  demo<46><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 47:  demo<47><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 48:  demo<48><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
		case 49:  demo<49><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;
	}
	cudaMemcpy(out, d_out, width * height * sizeof(int), cudaMemcpyDeviceToHost);
	for (ii = 0; ii < height; ii++) {
		for (jj = 0; jj < width; jj++) {
			printf("%d\t", out[ii * width + jj]);
		}
		printf("\n");
	}

	return 0;
}
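On the macro question: a plain C preprocessor macro can generate the repeated cases. Each template is still instantiated separately, so the binary-size and compile-time cost remains, but the source shrinks and the typo risk goes away. A sketch, using a hypothetical DEMO_CASE helper:

// Hypothetical helper: expands to one switch case per template instantiation.
#define DEMO_CASE(n) \
	case n: demo<n><<<gridsize, blocksize>>>(d_in, d_out, width, height); break;

	switch (radius) {
		case 0: return 0;
		DEMO_CASE(1)  DEMO_CASE(2)  DEMO_CASE(3)  DEMO_CASE(4)  DEMO_CASE(5)
		DEMO_CASE(6)  DEMO_CASE(7)  DEMO_CASE(8)  DEMO_CASE(9)  DEMO_CASE(10)
		// ... and so on, up to DEMO_CASE(49)
	}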

What exactly is this kernel meant to compute? Is what you’ve posted the actual computation you’re interested in?

Your code seems to launch a thread per row to copy a width*height array, except it treats the first radius elements of each row specially. Is this correct? If so, why not launch width*height threads and do away with the buffer entirely?

This is an example. My code, which I have not posted, does image processing on images with arbitrary apron radius, so I need to do something like:

__shared__ int3 smem[blockDim.x + 2*radius + 1][blockDim.y + 2*radius + 1];

which I was hoping to do, but apparently cannot

thanks for all the help thus far
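One closing note: for shared memory specifically, CUDA does allow the size to be chosen at launch time. Declare the array extern __shared__ and pass the byte count as the third launch-configuration parameter; the array is necessarily one-dimensional, so 2D indexing has to be done by hand. A minimal sketch (the kernel and names are illustrative, and the apron loads are omitted for brevity):

// Minimal sketch (illustrative): runtime-sized shared memory.
__global__ void apron_demo(const int3 * in, int3 * out, int width, int radius)
{
	extern __shared__ int3 smem[];            // size fixed at launch, not compile time
	int tileW = blockDim.x + 2 * radius + 1;  // tile width including the apron
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	// stage this thread's central element (apron loads omitted)
	smem[(threadIdx.y + radius) * tileW + threadIdx.x + radius] = in[y * width + x];
	__syncthreads();

	out[y * width + x] = smem[(threadIdx.y + radius) * tileW + threadIdx.x + radius];
}

/* launch: the third <<<...>>> argument is the shared-memory byte count
   dim3 block(16, 16), grid(width / block.x, height / block.y);
   size_t bytes = (block.x + 2*radius + 1) * (block.y + 2*radius + 1) * sizeof(int3);
   apron_demo<<<grid, block, bytes>>>(d_in, d_out, width, radius);
*/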