Hi, i’m new here…I hope i’ve choosen the right section to post about a problem i can’t resolve.
When i launch my kernel i’ve got this error: Unspecified launch failure.
I don’t know why i get this error, but i have realized that if i comment an instruction, the kernel works (but don’t do his job obviously).
This is my kernel (i know that i have to work a lot for performance but now it’is secondary), the line is precedeed by a “***”.
The strange thing is that if i copy this line before the for loop the kernel works.
__global__ void ricerca_cuda(int *delta,int *phi,int *finalState,int *state_results,char *input_buffer){
int statoCorrente=0;
int patternIndex=0;
int idx= blockIdx.x*blockDim.x +threadIdx.x;
int j,deltaState;
char symbol='
global void ricerca_cuda(int *delta,int *phi,int *finalState,int *state_results,char *input_buffer){
int statoCorrente=0;
int patternIndex=0;
int idx= blockIdx.x*blockDim.x +threadIdx.x;
int j,deltaState;
char symbol='\0';
if(idx<MaxInput){
symbol=input_buffer[idx*MaxInputLength];
for(j=0;symbol!='\0';j++){
deltaState=delta[statoCorrente*AlphabetLength+(int)symbol];
if(deltaState!=-1){
statoCorrente=deltaState;
if(finalState[statoCorrente]==1){
*** state_results[idx*MaxPattern+patternIndex]=statoCorrente;
patternIndex++;
}
}
else if(statoCorrente!=0){
j--;
statoCorrente=phi[statoCorrente];
}
symbol=input_buffer[idx*MaxInputLength+j];
}
}
}
';
if(idx<MaxInput){
symbol=input_buffer[idx*MaxInputLength];
for(j=0;symbol!='
global void ricerca_cuda(int *delta,int *phi,int *finalState,int *state_results,char *input_buffer){
int statoCorrente=0;
int patternIndex=0;
int idx= blockIdx.x*blockDim.x +threadIdx.x;
int j,deltaState;
char symbol='\0';
if(idx<MaxInput){
symbol=input_buffer[idx*MaxInputLength];
for(j=0;symbol!='\0';j++){
deltaState=delta[statoCorrente*AlphabetLength+(int)symbol];
if(deltaState!=-1){
statoCorrente=deltaState;
if(finalState[statoCorrente]==1){
*** state_results[idx*MaxPattern+patternIndex]=statoCorrente;
patternIndex++;
}
}
else if(statoCorrente!=0){
j--;
statoCorrente=phi[statoCorrente];
}
symbol=input_buffer[idx*MaxInputLength+j];
}
}
}
';j++){
deltaState=delta[statoCorrente*AlphabetLength+(int)symbol];
if(deltaState!=-1){
statoCorrente=deltaState;
if(finalState[statoCorrente]==1){
*** state_results[idx*MaxPattern+patternIndex]=statoCorrente;
patternIndex++;
}
}
else if(statoCorrente!=0){
j--;
statoCorrente=phi[statoCorrente];
}
symbol=input_buffer[idx*MaxInputLength+j];
}
}
}
This is the malloc and memcpy part:
//Allocation of host arrays
phi=(int*)malloc(MaxStates*sizeof(int));
delta=(int*)malloc(MaxStates*AlphabetLength*sizeof(int));
finalState=(int*)malloc(MaxStates*sizeof(int));
state_results=(int*)malloc((MaxInput+1)*MaxPattern*sizeof(int));
//cudaMalloc and cudaMemcpy of phi
cudaMalloc((void**)&phi_d,MaxStates*sizeof(int));
cudaMemcpy(phi_d,phi,MaxStates*sizeof(int),cudaMemcpyHostToDevice);
//cudaMalloc and cudaMemcpy of delta
cudaMalloc((void**)&delta_d,MaxStates*AlphabetLength*sizeof(int));
cudaMemcpy(delta_d,delta,MaxStates*AlphabetLength*sizeof(int),cudaMemcpyHostToDevice);
//cudaMalloc and cudaMemcpy of input_buffer (previusly allocated)
cudaMalloc((void**)&input_buffer_d,(MaxInput +1)*MaxInputLength*sizeof(char));
cudaMemcpy(input_buffer_d,input_buffer_p,(MaxInput +1)*MaxInputLength*sizeof(char),cudaMemcpyHostToDevice);
//cudaMalloc and cudaMemcpy of finalState
cudaMalloc((void**)&finalState_d,MaxStates*sizeof(int));
cudaMemcpy(finalState_d,finalState,MaxStates*sizeof(int),cudaMemcpyHostToDevice);
//cudaMalloc and cudaMemcpy of state_results
cudaMalloc((void**)&state_results_d,MaxInput*MaxPattern*sizeof(int));
cudaMemcpy(state_results_d,state_results,MaxInput*MaxPattern*sizeof(int),cudaMemcpyHostToDevice);
This is the kernel invocation:
void ricercaCuda()
{
dim3 dimBlock(256,1,1);
dim3 dimGrid(32,1);
printf("Before kernel launch: %s\n",cudaGetErrorString(cudaGetLastError()));
ricerca_cuda <<<dimGrid,dimBlock>>>(delta_d,phi_d,finalState_d,state_results_d,input_buffer_
d);
cudaThreadSynchronize();
printf("After kernel launchl: %s\n",cudaGetErrorString(cudaGetLastError()));
cudaMemcpy(state_results,state_results_d,MaxInput*MaxPattern*sizeof(int),cudaMemcpyDeviceToHost);
printf("After cudaMemcpy: %s\n",cudaGetErrorString(cudaGetLastError()));
cudaThreadExit();
}
The output is:
Before kernel launch: no error
After kernel launch: unspecified launch failure
After cudaMemcpy: unspecified launch failure
Anyone can help me?