Unspecified launch failure on a nested loop (C - multiple pattern matching algorithm) A double loop

Hello, i have been trying to run the wu-manber multiple pattern matching algorithm on a cuda gpu. I’m taking baby steps because i am new to cuda and the algorithm right now is in the process of running on a single block-thread of global device memory.

The problem kicks in when i run a double loop in the start of the alogrithm, it returns the cute little error called:

Cuda error: kernel synchronization: unspecified launch failure.

When i delete the loop, all the program runs smoothly. (i suppose because i get no such error).

Here is the host code:

struct  prefixArray {

	int value[];		

	int size;		

	int index[];		

};

int main( int argc, char** argv)

{	long int n;

    	int m, alphabet, p_size, i = 0;

        char *text_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	char *pattern_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	// device memory pointers    	

    	unsigned char **d_pattern, *d_text; 

	int *d_matches;

	size_t pitch;

	// define grid and block size 

	int numThreadsPerBlock, numBlocks;

	int sharedMemSize = 16000;

	// Allocate the maximum amount of shared memory

	if( sharedMemSize > 16384 )

		fail("Requested more shared memory than the capability of the hardware! \n");

// size of text

    	size_t memSize_t = n * sizeof( char );

	// host memory allocation

    	unsigned char *text = ( unsigned char * ) malloc ( memSize_t );

	unsigned char **pattern = ( unsigned char ** ) malloc ( m * sizeof( unsigned char * ));

	for( i = 0; i < m; i++ ) {

		pattern[i] = ( unsigned char * ) malloc ( m * sizeof( unsigned char ));

	}

	int *cuda_matches = ( int * ) malloc ( sizeof( int ) * m );

	int *results_array = ( int * ) malloc ( sizeof( int ) * ( ( n - m + 1) / m ) );

	text_filename = "data/text_test";

	pattern_filename = "data/pattern_test";

	load_files( pattern, text, m, n, pattern_filename, text_filename, p_size );

	// cuda device memory allocation

	cudaMallocPitch( (void**) &d_pattern, &pitch, m * sizeof(unsigned char * ), p_size );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_text, memSize_t );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_matches, m * sizeof(int)  );

	checkCUDAError("Malloc");

	// copy host memory to device memory

	cudaMemcpy2D( d_pattern, pitch, pattern, m * p_size, m * sizeof(unsigned char * ), p_size, cudaMemcpyHostToDevice );

	cudaMemcpy( d_text, text, memSize_t, cudaMemcpyHostToDevice );

	cudaMemcpy( d_matches, cuda_matches, m * sizeof(int *), cudaMemcpyHostToDevice );	

	// launch kernel

	dim3 dimGrid(numBlocks);

	dim3 dimBlock(numThreadsPerBlock);

long shiftsize = 2 * power( 180, 3 ); // B = 3

	int *d_SHIFT;

	cudaMalloc( (void **) &d_SHIFT, sizeof( int ) * shiftsize );

	checkCUDAError("Malloc");

	struct prefixArray *d_PREFIX;

	cudaMalloc( (void **) &d_PREFIX, sizeof( struct prefixArray ) * shiftsize);

	checkCUDAError("Malloc");	

	// kernel launch

	wuKernel<<< dimGrid, dimBlock, sharedMemSize >>>(pitch, d_pattern, d_text, d_matches, m, n, numBlocks, shiftsize, d_SHIFT, d_PREFIX, p_size, alphabet);

	checkCUDAError("kernel invocation");

// block until the device has completed

	cudaThreadSynchronize();

	checkCUDAError("kernel synchronization");

// free memory.....

}

And the kernel code (where the error occurs “//for each pattern”):

__global__ void wuKernel(int pitch, unsigned char **d_pattern, unsigned char *d_text, int *d_matches, int m, long n, int blocksize, long shiftsize, int *d_SHIFT, struct prefixArray *d_PREFIX, int p_size, int alphabet) {

int i, j, q, B = 3;

	d_matches = 0;

	struct d_structAlphabet {

		char letter;		

		unsigned char offset; 	

	} d_lookup[128];			

	unsigned char m_nSizeOfAlphabet = 1;

	unsigned short m_nBitsInShift = (unsigned short) ceil( log( (double) m_nSizeOfAlphabet ) / log( (double) 2 ) )

	unsigned int hash, hash1, hash2;

	size_t shift, shiftlen, index = m - 1, prefixhash;

	for ( i = 0; i < shiftsize; i++ )

		d_SHIFT[i] = m - B + 1;

	//Map characters to index

	for ( i = 0; i <= 128; ++i ) {

		if ( ( i >= 33 ) && ( i <= 126 ) ) {

			d_lookup[i].letter = (char) i; 

			d_lookup[i].offset = m_nSizeOfAlphabet++;

		}

		else {

			d_lookup[i].letter = ' '; // table is defaulted to whitespace

			d_lookup[i].offset = 0;   

		}

	}

	//for each pattern

	for ( j = 0; j < p_size; ++j ) {

		//add each 3-character subpattern (similar to q-grams)

		for ( q = m; q >= B; --q ) {

			hash  = d_lookup[d_pattern[j][q - 2 - 1]].offset; // bring in offsets of X in pattern j

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q - 1 - 1]].offset;

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q     - 1]].offset;

			shiftlen = m - q;

			d_SHIFT[hash] = MIN( d_SHIFT[hash], shiftlen );

			//calculate the hash of the prefixes for each pattern

			if ( shiftlen == 0 ) {

				prefixhash = d_lookup[d_pattern[j][0]].offset;

				prefixhash <<= m_nBitsInShift;

				prefixhash += d_lookup[d_pattern[j][1]].offset;

				d_PREFIX[hash].value[d_PREFIX[hash].size] = prefixhash;

				d_PREFIX[hash].index[d_PREFIX[hash].size] = j;

				d_PREFIX[hash].size++;

			}

		}

	}

//more code.....

}

I was thinking maybe i do something wrong with the cudamallocpitch or i dont use the pitch correctly. Any ideas?

Hello, i have been trying to run the wu-manber multiple pattern matching algorithm on a cuda gpu. I’m taking baby steps because i am new to cuda and the algorithm right now is in the process of running on a single block-thread of global device memory.

The problem kicks in when i run a double loop in the start of the alogrithm, it returns the cute little error called:

Cuda error: kernel synchronization: unspecified launch failure.

When i delete the loop, all the program runs smoothly. (i suppose because i get no such error).

Here is the host code:

struct  prefixArray {

	int value[];		

	int size;		

	int index[];		

};

int main( int argc, char** argv)

{	long int n;

    	int m, alphabet, p_size, i = 0;

        char *text_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	char *pattern_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	// device memory pointers    	

    	unsigned char **d_pattern, *d_text; 

	int *d_matches;

	size_t pitch;

	// define grid and block size 

	int numThreadsPerBlock, numBlocks;

	int sharedMemSize = 16000;

	// Allocate the maximum amount of shared memory

	if( sharedMemSize > 16384 )

		fail("Requested more shared memory than the capability of the hardware! \n");

// size of text

    	size_t memSize_t = n * sizeof( char );

	// host memory allocation

    	unsigned char *text = ( unsigned char * ) malloc ( memSize_t );

	unsigned char **pattern = ( unsigned char ** ) malloc ( m * sizeof( unsigned char * ));

	for( i = 0; i < m; i++ ) {

		pattern[i] = ( unsigned char * ) malloc ( m * sizeof( unsigned char ));

	}

	int *cuda_matches = ( int * ) malloc ( sizeof( int ) * m );

	int *results_array = ( int * ) malloc ( sizeof( int ) * ( ( n - m + 1) / m ) );

	text_filename = "data/text_test";

	pattern_filename = "data/pattern_test";

	load_files( pattern, text, m, n, pattern_filename, text_filename, p_size );

	// cuda device memory allocation

	cudaMallocPitch( (void**) &d_pattern, &pitch, m * sizeof(unsigned char * ), p_size );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_text, memSize_t );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_matches, m * sizeof(int)  );

	checkCUDAError("Malloc");

	// copy host memory to device memory

	cudaMemcpy2D( d_pattern, pitch, pattern, m * p_size, m * sizeof(unsigned char * ), p_size, cudaMemcpyHostToDevice );

	cudaMemcpy( d_text, text, memSize_t, cudaMemcpyHostToDevice );

	cudaMemcpy( d_matches, cuda_matches, m * sizeof(int *), cudaMemcpyHostToDevice );	

	// launch kernel

	dim3 dimGrid(numBlocks);

	dim3 dimBlock(numThreadsPerBlock);

long shiftsize = 2 * power( 180, 3 ); // B = 3

	int *d_SHIFT;

	cudaMalloc( (void **) &d_SHIFT, sizeof( int ) * shiftsize );

	checkCUDAError("Malloc");

	struct prefixArray *d_PREFIX;

	cudaMalloc( (void **) &d_PREFIX, sizeof( struct prefixArray ) * shiftsize);

	checkCUDAError("Malloc");	

	// kernel launch

	wuKernel<<< dimGrid, dimBlock, sharedMemSize >>>(pitch, d_pattern, d_text, d_matches, m, n, numBlocks, shiftsize, d_SHIFT, d_PREFIX, p_size, alphabet);

	checkCUDAError("kernel invocation");

// block until the device has completed

	cudaThreadSynchronize();

	checkCUDAError("kernel synchronization");

// free memory.....

}

And the kernel code (where the error occurs “//for each pattern”):

__global__ void wuKernel(int pitch, unsigned char **d_pattern, unsigned char *d_text, int *d_matches, int m, long n, int blocksize, long shiftsize, int *d_SHIFT, struct prefixArray *d_PREFIX, int p_size, int alphabet) {

int i, j, q, B = 3;

	d_matches = 0;

	struct d_structAlphabet {

		char letter;		

		unsigned char offset; 	

	} d_lookup[128];			

	unsigned char m_nSizeOfAlphabet = 1;

	unsigned short m_nBitsInShift = (unsigned short) ceil( log( (double) m_nSizeOfAlphabet ) / log( (double) 2 ) )

	unsigned int hash, hash1, hash2;

	size_t shift, shiftlen, index = m - 1, prefixhash;

	for ( i = 0; i < shiftsize; i++ )

		d_SHIFT[i] = m - B + 1;

	//Map characters to index

	for ( i = 0; i <= 128; ++i ) {

		if ( ( i >= 33 ) && ( i <= 126 ) ) {

			d_lookup[i].letter = (char) i; 

			d_lookup[i].offset = m_nSizeOfAlphabet++;

		}

		else {

			d_lookup[i].letter = ' '; // table is defaulted to whitespace

			d_lookup[i].offset = 0;   

		}

	}

	//for each pattern

	for ( j = 0; j < p_size; ++j ) {

		//add each 3-character subpattern (similar to q-grams)

		for ( q = m; q >= B; --q ) {

			hash  = d_lookup[d_pattern[j][q - 2 - 1]].offset; // bring in offsets of X in pattern j

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q - 1 - 1]].offset;

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q     - 1]].offset;

			shiftlen = m - q;

			d_SHIFT[hash] = MIN( d_SHIFT[hash], shiftlen );

			//calculate the hash of the prefixes for each pattern

			if ( shiftlen == 0 ) {

				prefixhash = d_lookup[d_pattern[j][0]].offset;

				prefixhash <<= m_nBitsInShift;

				prefixhash += d_lookup[d_pattern[j][1]].offset;

				d_PREFIX[hash].value[d_PREFIX[hash].size] = prefixhash;

				d_PREFIX[hash].index[d_PREFIX[hash].size] = j;

				d_PREFIX[hash].size++;

			}

		}

	}

//more code.....

}

I was thinking maybe i do something wrong with the cudamallocpitch or i dont use the pitch correctly. Any ideas?

Unspecified launch error usually means out of bounds memory access, probably something in the calculation of hash is incorrect. Try using cudamemcheck and see what it reports.

Unspecified launch error usually means out of bounds memory access, probably something in the calculation of hash is incorrect. Try using cudamemcheck and see what it reports.

Thanks for the reply,

i tried cuda-gdb but it fails because of the X11. I am running the code through ssh and no one was at the lab on weekend so i couldn’t change it to console mode.

Then i ran it through cuda-memcheck and it returns the following error:

Cuda error: Kernel Synchronization: the launch timed out and was terminated.

I believe the gui of the os does the trick. Going to disable it today and see what happens.

Thanks for the reply,

i tried cuda-gdb but it fails because of the X11. I am running the code through ssh and no one was at the lab on weekend so i couldn’t change it to console mode.

Then i ran it through cuda-memcheck and it returns the following error:

Cuda error: Kernel Synchronization: the launch timed out and was terminated.

I believe the gui of the os does the trick. Going to disable it today and see what happens.