Unspecified launch failure

Hi, i’m new here…I hope i’ve choosen the right section to post about a problem i can’t resolve.

When i launch my kernel i’ve got this error: Unspecified launch failure.

I don’t know why i get this error, but i have realized that if i comment an instruction, the kernel works (but don’t do his job obviously).

This is my kernel (i know that i have to work a lot for performance but now it’is secondary), the line is precedeed by a “***”.

The strange thing is that if i copy this line before the for loop the kernel works.

__global__ void ricerca_cuda(int *delta,int *phi,int *finalState,int *state_results,char *input_buffer){

	int statoCorrente=0;

	int patternIndex=0;

	int idx= blockIdx.x*blockDim.x +threadIdx.x;

	int j,deltaState;

	char symbol='

global void ricerca_cuda(int *delta,int *phi,int *finalState,int *state_results,char *input_buffer){

int statoCorrente=0;

int patternIndex=0;

int idx= blockIdx.x*blockDim.x +threadIdx.x;

int j,deltaState;

char symbol='\0';

if(idx<MaxInput){

	symbol=input_buffer[idx*MaxInputLength];

	

	for(j=0;symbol!='\0';j++){

		deltaState=delta[statoCorrente*AlphabetLength+(int)symbol];			

		if(deltaState!=-1){

			statoCorrente=deltaState;	

			if(finalState[statoCorrente]==1){

*** state_results[idx*MaxPattern+patternIndex]=statoCorrente;

				patternIndex++;

			  }

		}

		else if(statoCorrente!=0){

			j--;

			statoCorrente=phi[statoCorrente];

		}

		symbol=input_buffer[idx*MaxInputLength+j];			

		}

}

}

';

if(idx<MaxInput){

		symbol=input_buffer[idx*MaxInputLength];

		

		for(j=0;symbol!='

global void ricerca_cuda(int *delta,int *phi,int *finalState,int *state_results,char *input_buffer){

int statoCorrente=0;

int patternIndex=0;

int idx= blockIdx.x*blockDim.x +threadIdx.x;

int j,deltaState;

char symbol='\0';

if(idx<MaxInput){

	symbol=input_buffer[idx*MaxInputLength];

	

	for(j=0;symbol!='\0';j++){

		deltaState=delta[statoCorrente*AlphabetLength+(int)symbol];			

		if(deltaState!=-1){

			statoCorrente=deltaState;	

			if(finalState[statoCorrente]==1){

*** state_results[idx*MaxPattern+patternIndex]=statoCorrente;

				patternIndex++;

			  }

		}

		else if(statoCorrente!=0){

			j--;

			statoCorrente=phi[statoCorrente];

		}

		symbol=input_buffer[idx*MaxInputLength+j];			

		}

}

}

';j++){

			deltaState=delta[statoCorrente*AlphabetLength+(int)symbol];			

			if(deltaState!=-1){

				statoCorrente=deltaState;	

				if(finalState[statoCorrente]==1){

***					state_results[idx*MaxPattern+patternIndex]=statoCorrente;

					patternIndex++;

				  }

			}

			else if(statoCorrente!=0){

				j--;

				statoCorrente=phi[statoCorrente];

			}

			symbol=input_buffer[idx*MaxInputLength+j];			

			}

	}

}

This is the malloc and memcpy part:

//Allocation of host arrays

	phi=(int*)malloc(MaxStates*sizeof(int));

	delta=(int*)malloc(MaxStates*AlphabetLength*sizeof(int));

	finalState=(int*)malloc(MaxStates*sizeof(int));

	state_results=(int*)malloc((MaxInput+1)*MaxPattern*sizeof(int));

	//cudaMalloc and cudaMemcpy of phi

	cudaMalloc((void**)&phi_d,MaxStates*sizeof(int));

	cudaMemcpy(phi_d,phi,MaxStates*sizeof(int),cudaMemcpyHostToDevice);

	

	//cudaMalloc and cudaMemcpy of delta

	cudaMalloc((void**)&delta_d,MaxStates*AlphabetLength*sizeof(int));

	cudaMemcpy(delta_d,delta,MaxStates*AlphabetLength*sizeof(int),cudaMemcpyHostToDevice);

	//cudaMalloc and cudaMemcpy of input_buffer (previusly allocated)

	cudaMalloc((void**)&input_buffer_d,(MaxInput +1)*MaxInputLength*sizeof(char));

	cudaMemcpy(input_buffer_d,input_buffer_p,(MaxInput +1)*MaxInputLength*sizeof(char),cudaMemcpyHostToDevice);

	//cudaMalloc and cudaMemcpy of finalState

	cudaMalloc((void**)&finalState_d,MaxStates*sizeof(int));

	cudaMemcpy(finalState_d,finalState,MaxStates*sizeof(int),cudaMemcpyHostToDevice);

	//cudaMalloc and cudaMemcpy of state_results

	cudaMalloc((void**)&state_results_d,MaxInput*MaxPattern*sizeof(int));

	cudaMemcpy(state_results_d,state_results,MaxInput*MaxPattern*sizeof(int),cudaMemcpyHostToDevice);

This is the kernel invocation:

void ricercaCuda()

{	

	

	dim3 dimBlock(256,1,1);

	dim3 dimGrid(32,1);

		

	printf("Before kernel launch: %s\n",cudaGetErrorString(cudaGetLastError()));

	ricerca_cuda <<<dimGrid,dimBlock>>>(delta_d,phi_d,finalState_d,state_results_d,input_buffer_

d);

	cudaThreadSynchronize();

	printf("After kernel launchl: %s\n",cudaGetErrorString(cudaGetLastError()));

	

	cudaMemcpy(state_results,state_results_d,MaxInput*MaxPattern*sizeof(int),cudaMemcpyDeviceToHost);

	printf("After cudaMemcpy: %s\n",cudaGetErrorString(cudaGetLastError()));

	cudaThreadExit();

}

The output is:

Before kernel launch: no error

After kernel launch: unspecified launch failure

After cudaMemcpy: unspecified launch failure

Anyone can help me?

I got similar behavior once. CUDA_UNKNOWN_ERROR was returned in any function after kernel launch.
As it turned out to be it was an out of bounds array access inside a kernel.
So check your kernel code and make sure you read/write from/to valid memory.

I have a similar function on host and it works, therefore i think i’m reading/writing valid memory.

I’m not completely sure of it because i can’t compile the kernel code in emu mode but i don’t think that out of bounds array access is the problem.

Edit: in addition to this if i move the line "state_results[idx*MaxPattern+patternIndex]=statoCorrent

e;" before the loop, it works.

If it works there i don’t understand why it shouldn’t work inside it