Hello, i have been trying to run the wu-manber multiple pattern matching algorithm on a cuda gpu. I’m taking baby steps because i am new to cuda and the algorithm right now is in the process of running on a single block-thread of global device memory.
The problem kicks in when i run a double loop in the start of the alogrithm, it returns the cute little error called:
Cuda error: kernel synchronization: unspecified launch failure.
When i delete the loop, all the program runs smoothly. (i suppose because i get no such error).
Here is the host code:
struct prefixArray {
int value[];
int size;
int index[];
};
int main( int argc, char** argv)
{ long int n;
int m, alphabet, p_size, i = 0;
char *text_filename = ( char * ) malloc ( sizeof( char ) * 100 );
char *pattern_filename = ( char * ) malloc ( sizeof( char ) * 100 );
// device memory pointers
unsigned char **d_pattern, *d_text;
int *d_matches;
size_t pitch;
// define grid and block size
int numThreadsPerBlock, numBlocks;
int sharedMemSize = 16000;
// Allocate the maximum amount of shared memory
if( sharedMemSize > 16384 )
fail("Requested more shared memory than the capability of the hardware! \n");
// size of text
size_t memSize_t = n * sizeof( char );
// host memory allocation
unsigned char *text = ( unsigned char * ) malloc ( memSize_t );
unsigned char **pattern = ( unsigned char ** ) malloc ( m * sizeof( unsigned char * ));
for( i = 0; i < m; i++ ) {
pattern[i] = ( unsigned char * ) malloc ( m * sizeof( unsigned char ));
}
int *cuda_matches = ( int * ) malloc ( sizeof( int ) * m );
int *results_array = ( int * ) malloc ( sizeof( int ) * ( ( n - m + 1) / m ) );
text_filename = "data/text_test";
pattern_filename = "data/pattern_test";
load_files( pattern, text, m, n, pattern_filename, text_filename, p_size );
// cuda device memory allocation
cudaMallocPitch( (void**) &d_pattern, &pitch, m * sizeof(unsigned char * ), p_size );
checkCUDAError("Malloc");
cudaMalloc( (void **) &d_text, memSize_t );
checkCUDAError("Malloc");
cudaMalloc( (void **) &d_matches, m * sizeof(int) );
checkCUDAError("Malloc");
// copy host memory to device memory
cudaMemcpy2D( d_pattern, pitch, pattern, m * p_size, m * sizeof(unsigned char * ), p_size, cudaMemcpyHostToDevice );
cudaMemcpy( d_text, text, memSize_t, cudaMemcpyHostToDevice );
cudaMemcpy( d_matches, cuda_matches, m * sizeof(int *), cudaMemcpyHostToDevice );
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
long shiftsize = 2 * power( 180, 3 ); // B = 3
int *d_SHIFT;
cudaMalloc( (void **) &d_SHIFT, sizeof( int ) * shiftsize );
checkCUDAError("Malloc");
struct prefixArray *d_PREFIX;
cudaMalloc( (void **) &d_PREFIX, sizeof( struct prefixArray ) * shiftsize);
checkCUDAError("Malloc");
// kernel launch
wuKernel<<< dimGrid, dimBlock, sharedMemSize >>>(pitch, d_pattern, d_text, d_matches, m, n, numBlocks, shiftsize, d_SHIFT, d_PREFIX, p_size, alphabet);
checkCUDAError("kernel invocation");
// block until the device has completed
cudaThreadSynchronize();
checkCUDAError("kernel synchronization");
// free memory.....
}
And the kernel code (where the error occurs “//for each pattern”):
__global__ void wuKernel(int pitch, unsigned char **d_pattern, unsigned char *d_text, int *d_matches, int m, long n, int blocksize, long shiftsize, int *d_SHIFT, struct prefixArray *d_PREFIX, int p_size, int alphabet) {
int i, j, q, B = 3;
d_matches = 0;
struct d_structAlphabet {
char letter;
unsigned char offset;
} d_lookup[128];
unsigned char m_nSizeOfAlphabet = 1;
unsigned short m_nBitsInShift = (unsigned short) ceil( log( (double) m_nSizeOfAlphabet ) / log( (double) 2 ) )
unsigned int hash, hash1, hash2;
size_t shift, shiftlen, index = m - 1, prefixhash;
for ( i = 0; i < shiftsize; i++ )
d_SHIFT[i] = m - B + 1;
//Map characters to index
for ( i = 0; i <= 128; ++i ) {
if ( ( i >= 33 ) && ( i <= 126 ) ) {
d_lookup[i].letter = (char) i;
d_lookup[i].offset = m_nSizeOfAlphabet++;
}
else {
d_lookup[i].letter = ' '; // table is defaulted to whitespace
d_lookup[i].offset = 0;
}
}
//for each pattern
for ( j = 0; j < p_size; ++j ) {
//add each 3-character subpattern (similar to q-grams)
for ( q = m; q >= B; --q ) {
hash = d_lookup[d_pattern[j][q - 2 - 1]].offset; // bring in offsets of X in pattern j
hash <<= m_nBitsInShift;
hash += d_lookup[d_pattern[j][q - 1 - 1]].offset;
hash <<= m_nBitsInShift;
hash += d_lookup[d_pattern[j][q - 1]].offset;
shiftlen = m - q;
d_SHIFT[hash] = MIN( d_SHIFT[hash], shiftlen );
//calculate the hash of the prefixes for each pattern
if ( shiftlen == 0 ) {
prefixhash = d_lookup[d_pattern[j][0]].offset;
prefixhash <<= m_nBitsInShift;
prefixhash += d_lookup[d_pattern[j][1]].offset;
d_PREFIX[hash].value[d_PREFIX[hash].size] = prefixhash;
d_PREFIX[hash].index[d_PREFIX[hash].size] = j;
d_PREFIX[hash].size++;
}
}
}
//more code.....
}
I was thinking maybe i do something wrong with the cudamallocpitch or i dont use the pitch correctly. Any ideas?