Unspecified launch failure on a nested loop (C - multiple pattern matching algorithm) A double loop

sushi.perv · November 19, 2010, 12:03pm

Hello, i have been trying to run the wu-manber multiple pattern matching algorithm on a cuda gpu. I’m taking baby steps because i am new to cuda and the algorithm right now is in the process of running on a single block-thread of global device memory.

The problem kicks in when i run a double loop in the start of the alogrithm, it returns the cute little error called:

Cuda error: kernel synchronization: unspecified launch failure.

When i delete the loop, all the program runs smoothly. (i suppose because i get no such error).

Here is the host code:

struct  prefixArray {

	int value[];		

	int size;		

	int index[];		

};

int main( int argc, char** argv)

{	long int n;

    	int m, alphabet, p_size, i = 0;

        char *text_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	char *pattern_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	// device memory pointers    	

    	unsigned char **d_pattern, *d_text; 

	int *d_matches;

	size_t pitch;

	// define grid and block size 

	int numThreadsPerBlock, numBlocks;

	int sharedMemSize = 16000;

	// Allocate the maximum amount of shared memory

	if( sharedMemSize > 16384 )

		fail("Requested more shared memory than the capability of the hardware! \n");

// size of text

    	size_t memSize_t = n * sizeof( char );

	// host memory allocation

    	unsigned char *text = ( unsigned char * ) malloc ( memSize_t );

	unsigned char **pattern = ( unsigned char ** ) malloc ( m * sizeof( unsigned char * ));

	for( i = 0; i < m; i++ ) {

		pattern[i] = ( unsigned char * ) malloc ( m * sizeof( unsigned char ));

	}

	int *cuda_matches = ( int * ) malloc ( sizeof( int ) * m );

	int *results_array = ( int * ) malloc ( sizeof( int ) * ( ( n - m + 1) / m ) );

	text_filename = "data/text_test";

	pattern_filename = "data/pattern_test";

	load_files( pattern, text, m, n, pattern_filename, text_filename, p_size );

	// cuda device memory allocation

	cudaMallocPitch( (void**) &d_pattern, &pitch, m * sizeof(unsigned char * ), p_size );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_text, memSize_t );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_matches, m * sizeof(int)  );

	checkCUDAError("Malloc");

	// copy host memory to device memory

	cudaMemcpy2D( d_pattern, pitch, pattern, m * p_size, m * sizeof(unsigned char * ), p_size, cudaMemcpyHostToDevice );

	cudaMemcpy( d_text, text, memSize_t, cudaMemcpyHostToDevice );

	cudaMemcpy( d_matches, cuda_matches, m * sizeof(int *), cudaMemcpyHostToDevice );	

	// launch kernel

	dim3 dimGrid(numBlocks);

	dim3 dimBlock(numThreadsPerBlock);

long shiftsize = 2 * power( 180, 3 ); // B = 3

	int *d_SHIFT;

	cudaMalloc( (void **) &d_SHIFT, sizeof( int ) * shiftsize );

	checkCUDAError("Malloc");

	struct prefixArray *d_PREFIX;

	cudaMalloc( (void **) &d_PREFIX, sizeof( struct prefixArray ) * shiftsize);

	checkCUDAError("Malloc");	

	// kernel launch

	wuKernel<<< dimGrid, dimBlock, sharedMemSize >>>(pitch, d_pattern, d_text, d_matches, m, n, numBlocks, shiftsize, d_SHIFT, d_PREFIX, p_size, alphabet);

	checkCUDAError("kernel invocation");

// block until the device has completed

	cudaThreadSynchronize();

	checkCUDAError("kernel synchronization");

// free memory.....

}

And the kernel code (where the error occurs “//for each pattern”):

__global__ void wuKernel(int pitch, unsigned char **d_pattern, unsigned char *d_text, int *d_matches, int m, long n, int blocksize, long shiftsize, int *d_SHIFT, struct prefixArray *d_PREFIX, int p_size, int alphabet) {

int i, j, q, B = 3;

	d_matches = 0;

	struct d_structAlphabet {

		char letter;		

		unsigned char offset; 	

	} d_lookup[128];			

	unsigned char m_nSizeOfAlphabet = 1;

	unsigned short m_nBitsInShift = (unsigned short) ceil( log( (double) m_nSizeOfAlphabet ) / log( (double) 2 ) )

	unsigned int hash, hash1, hash2;

	size_t shift, shiftlen, index = m - 1, prefixhash;

	for ( i = 0; i < shiftsize; i++ )

		d_SHIFT[i] = m - B + 1;

	//Map characters to index

	for ( i = 0; i <= 128; ++i ) {

		if ( ( i >= 33 ) && ( i <= 126 ) ) {

			d_lookup[i].letter = (char) i; 

			d_lookup[i].offset = m_nSizeOfAlphabet++;

		}

		else {

			d_lookup[i].letter = ' '; // table is defaulted to whitespace

			d_lookup[i].offset = 0;   

		}

	}

	//for each pattern

	for ( j = 0; j < p_size; ++j ) {

		//add each 3-character subpattern (similar to q-grams)

		for ( q = m; q >= B; --q ) {

			hash  = d_lookup[d_pattern[j][q - 2 - 1]].offset; // bring in offsets of X in pattern j

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q - 1 - 1]].offset;

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q     - 1]].offset;

			shiftlen = m - q;

			d_SHIFT[hash] = MIN( d_SHIFT[hash], shiftlen );

			//calculate the hash of the prefixes for each pattern

			if ( shiftlen == 0 ) {

				prefixhash = d_lookup[d_pattern[j][0]].offset;

				prefixhash <<= m_nBitsInShift;

				prefixhash += d_lookup[d_pattern[j][1]].offset;

				d_PREFIX[hash].value[d_PREFIX[hash].size] = prefixhash;

				d_PREFIX[hash].index[d_PREFIX[hash].size] = j;

				d_PREFIX[hash].size++;

			}

		}

	}

//more code.....

}

I was thinking maybe i do something wrong with the cudamallocpitch or i dont use the pitch correctly. Any ideas?

sushi.perv · November 19, 2010, 12:03pm

Hello, i have been trying to run the wu-manber multiple pattern matching algorithm on a cuda gpu. I’m taking baby steps because i am new to cuda and the algorithm right now is in the process of running on a single block-thread of global device memory.

The problem kicks in when i run a double loop in the start of the alogrithm, it returns the cute little error called:

Cuda error: kernel synchronization: unspecified launch failure.

When i delete the loop, all the program runs smoothly. (i suppose because i get no such error).

Here is the host code:

struct  prefixArray {

	int value[];		

	int size;		

	int index[];		

};

int main( int argc, char** argv)

{	long int n;

    	int m, alphabet, p_size, i = 0;

        char *text_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	char *pattern_filename = ( char * ) malloc ( sizeof( char ) * 100 );

	// device memory pointers    	

    	unsigned char **d_pattern, *d_text; 

	int *d_matches;

	size_t pitch;

	// define grid and block size 

	int numThreadsPerBlock, numBlocks;

	int sharedMemSize = 16000;

	// Allocate the maximum amount of shared memory

	if( sharedMemSize > 16384 )

		fail("Requested more shared memory than the capability of the hardware! \n");

// size of text

    	size_t memSize_t = n * sizeof( char );

	// host memory allocation

    	unsigned char *text = ( unsigned char * ) malloc ( memSize_t );

	unsigned char **pattern = ( unsigned char ** ) malloc ( m * sizeof( unsigned char * ));

	for( i = 0; i < m; i++ ) {

		pattern[i] = ( unsigned char * ) malloc ( m * sizeof( unsigned char ));

	}

	int *cuda_matches = ( int * ) malloc ( sizeof( int ) * m );

	int *results_array = ( int * ) malloc ( sizeof( int ) * ( ( n - m + 1) / m ) );

	text_filename = "data/text_test";

	pattern_filename = "data/pattern_test";

	load_files( pattern, text, m, n, pattern_filename, text_filename, p_size );

	// cuda device memory allocation

	cudaMallocPitch( (void**) &d_pattern, &pitch, m * sizeof(unsigned char * ), p_size );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_text, memSize_t );

	checkCUDAError("Malloc");

	cudaMalloc( (void **) &d_matches, m * sizeof(int)  );

	checkCUDAError("Malloc");

	// copy host memory to device memory

	cudaMemcpy2D( d_pattern, pitch, pattern, m * p_size, m * sizeof(unsigned char * ), p_size, cudaMemcpyHostToDevice );

	cudaMemcpy( d_text, text, memSize_t, cudaMemcpyHostToDevice );

	cudaMemcpy( d_matches, cuda_matches, m * sizeof(int *), cudaMemcpyHostToDevice );	

	// launch kernel

	dim3 dimGrid(numBlocks);

	dim3 dimBlock(numThreadsPerBlock);

long shiftsize = 2 * power( 180, 3 ); // B = 3

	int *d_SHIFT;

	cudaMalloc( (void **) &d_SHIFT, sizeof( int ) * shiftsize );

	checkCUDAError("Malloc");

	struct prefixArray *d_PREFIX;

	cudaMalloc( (void **) &d_PREFIX, sizeof( struct prefixArray ) * shiftsize);

	checkCUDAError("Malloc");	

	// kernel launch

	wuKernel<<< dimGrid, dimBlock, sharedMemSize >>>(pitch, d_pattern, d_text, d_matches, m, n, numBlocks, shiftsize, d_SHIFT, d_PREFIX, p_size, alphabet);

	checkCUDAError("kernel invocation");

// block until the device has completed

	cudaThreadSynchronize();

	checkCUDAError("kernel synchronization");

// free memory.....

}

And the kernel code (where the error occurs “//for each pattern”):

__global__ void wuKernel(int pitch, unsigned char **d_pattern, unsigned char *d_text, int *d_matches, int m, long n, int blocksize, long shiftsize, int *d_SHIFT, struct prefixArray *d_PREFIX, int p_size, int alphabet) {

int i, j, q, B = 3;

	d_matches = 0;

	struct d_structAlphabet {

		char letter;		

		unsigned char offset; 	

	} d_lookup[128];			

	unsigned char m_nSizeOfAlphabet = 1;

	unsigned short m_nBitsInShift = (unsigned short) ceil( log( (double) m_nSizeOfAlphabet ) / log( (double) 2 ) )

	unsigned int hash, hash1, hash2;

	size_t shift, shiftlen, index = m - 1, prefixhash;

	for ( i = 0; i < shiftsize; i++ )

		d_SHIFT[i] = m - B + 1;

	//Map characters to index

	for ( i = 0; i <= 128; ++i ) {

		if ( ( i >= 33 ) && ( i <= 126 ) ) {

			d_lookup[i].letter = (char) i; 

			d_lookup[i].offset = m_nSizeOfAlphabet++;

		}

		else {

			d_lookup[i].letter = ' '; // table is defaulted to whitespace

			d_lookup[i].offset = 0;   

		}

	}

	//for each pattern

	for ( j = 0; j < p_size; ++j ) {

		//add each 3-character subpattern (similar to q-grams)

		for ( q = m; q >= B; --q ) {

			hash  = d_lookup[d_pattern[j][q - 2 - 1]].offset; // bring in offsets of X in pattern j

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q - 1 - 1]].offset;

			hash <<= m_nBitsInShift;

			hash += d_lookup[d_pattern[j][q     - 1]].offset;

			shiftlen = m - q;

			d_SHIFT[hash] = MIN( d_SHIFT[hash], shiftlen );

			//calculate the hash of the prefixes for each pattern

			if ( shiftlen == 0 ) {

				prefixhash = d_lookup[d_pattern[j][0]].offset;

				prefixhash <<= m_nBitsInShift;

				prefixhash += d_lookup[d_pattern[j][1]].offset;

				d_PREFIX[hash].value[d_PREFIX[hash].size] = prefixhash;

				d_PREFIX[hash].index[d_PREFIX[hash].size] = j;

				d_PREFIX[hash].size++;

			}

		}

	}

//more code.....

}

I was thinking maybe i do something wrong with the cudamallocpitch or i dont use the pitch correctly. Any ideas?

avidday · November 19, 2010, 12:50pm

Unspecified launch error usually means out of bounds memory access, probably something in the calculation of hash is incorrect. Try using cudamemcheck and see what it reports.

avidday · November 19, 2010, 12:50pm

Unspecified launch error usually means out of bounds memory access, probably something in the calculation of hash is incorrect. Try using cudamemcheck and see what it reports.

sushi.perv · November 22, 2010, 8:57am

Thanks for the reply,

i tried cuda-gdb but it fails because of the X11. I am running the code through ssh and no one was at the lab on weekend so i couldn’t change it to console mode.

Then i ran it through cuda-memcheck and it returns the following error:

Cuda error: Kernel Synchronization: the launch timed out and was terminated.

I believe the gui of the os does the trick. Going to disable it today and see what happens.

sushi.perv · November 22, 2010, 8:57am

Thanks for the reply,

i tried cuda-gdb but it fails because of the X11. I am running the code through ssh and no one was at the lab on weekend so i couldn’t change it to console mode.

Then i ran it through cuda-memcheck and it returns the following error:

Cuda error: Kernel Synchronization: the launch timed out and was terminated.

I believe the gui of the os does the trick. Going to disable it today and see what happens.