strange unspecified launch failure

hey

i’m working at a simple parallel bucket sort and one of my kernels keep returning me this error.

__global__ static void filler (int * poleDat, int * vyslednePole, int * pocty, int pocetPrvkov, int velkost, int pocetSubPostupnosti)

	{

		int i=0;

		int j= 0;

		int tmp=0;

		int tx = threadIdx.x;

		int ini = 0;

		

		

		

		if(pocty[tx] != 0) {

		for(i=0;i<tx;i++)

			{

				ini = ini + pocty[i];						//counting the position for the first element of the bucket

			}

		

		for (i=0;i<pocetPrvkov;i++)						//looping through the array 

			{

			if ((poleDat[i]>=tx*velkost) && (poleDat[i]<=((tx+1)*velkost)))		//determin if the current element fits into the array

				{ 

					vyslednePole[ini+j]=poleDat[i];								//if it does, save it to the first free position in the bucket

					j++;														//position counter in the bucket

				}

			}

							}

}

it seems that the error is caused by the line

vyslednePole[ini+j]=poleDat[i];

the variable vyslednePole is allocated in the host part this way

int * poleVysledne = AllocateOnDevice(pocetPrvkov);

int * AllocateOnDevice (int pocet)

	{

	int * Mdevice;

	int size = pocet * sizeof(int);

	cudaMalloc((void**)&Mdevice, size);

	return Mdevice;

	}

and then passed like an argument to the kernel. poleDat is allocated just the same way and also passed like an argument and the number of elemets is also pocetPrvkov so i see no problem here. the strange thing is that i keep getting this error even if i try to do just a simple copy of the array.

for (i=0;i<pocetPrvkov;i++)  vyslednePole[i]=poleDat[i];

i would really appreciate any ideas what could it be caused with :">

You can try to run your kernel in emulation mode. In emulation mode, it is possible to call host c-functions like printf from a kernel function to point out the real position for example of segmentation faults.

yes i forrgot to mention that in the emulation mode everithing is going just fine … i think that this line it’s causing the probelm … becouse if i comment it … then the error disappear …

I tried your code on my device, and I got no launch errors.

I used test.cu

[codebox]global static void filler (int * poleDat, int * vyslednePole, int * pocty, int pocetPrvkov, int velkost, int pocetSubPostupnosti)

{

int i=0;

    int j= 0;

    int tmp=0;

    int tx = threadIdx.x;

    int ini = 0;

if(pocty[tx] != 0) {

    for(i=0;i<tx;i++)

        {

            ini = ini + pocty[i];                        //counting the position for the first element of the bucket

        }

for (i=0;i<pocetPrvkov;i++) //looping through the array

        {

        if ((poleDat[i]>=tx*velkost) && (poleDat[i]<=((tx+1)*velkost)))        //determin if the current element fits into the array

            {

                vyslednePole[ini+j]=poleDat[i];                                //if it does, save it to the first free position in the bucket

                j++;                                                        //position counter in the bucket

            }

        }

}

}

int * AllocateOnDevice (int pocet)

{

int * Mdevice;

int size = pocet * sizeof(int);

cudaMalloc((void**)&Mdevice, size);

return Mdevice;

}

int main()

{

int pocetPrvkov = 100;

int * poleVysledne = AllocateOnDevice(pocetPrvkov);

int * poleDat = AllocateOnDevice(pocetPrvkov);

int * pocty = AllocateOnDevice(512);

int *pocty_h = (int*)malloc(sizeof(int) * 512 );

for( int i = 0; i < 512; i++ )

{

	pocty_h[ i ] = i;

}

cudaMemcpy( pocty, pocty_h, 512 * sizeof( int ), cudaMemcpyHostToDevice );

dim3 dimGrid( 1,1);

dim3 dimBlock( 512 );

filler<<< dimGrid, dimBlock >>>( poleDat, poleVysledne, pocty, pocetPrvkov, 123, 456 );

cudaFree( poleVysledne );

cudaFree( poleDat );

cudaFree( pocty );

return 0;

}

[/codebox]

[codebox]linux-x4gv:~/cuda> nvcc -o test test.cu

test.cu(9): warning: variable “tmp” was declared but never referenced

test.cu(9): warning: variable “tmp” was declared but never referenced

linux-x4gv:~/cuda> ./test

linux-x4gv:~/cuda>

[/codebox]

Did you use cudaFree() to free your device memory? Perhaps you allocated to much memory?

edit: to ensure the call of your critical line I commented

[codebox]

if ((poleDat[i]>=tx*velkost) && (poleDat[i]<=((tx+1)*velkost)))

[/codebox]

and got no error, too.

i’m not getting this error right after the compilation this will go ok … but if i give a cudaGetLastError() after this kernel invocation i will get it and i cannot get any data from vyslednePole with cudaMemcpy() … it’s weird …

but anyway thank you for your help :)

use -deviceemu and valgrind, it should show you exactly what the problem is