#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cutil_inline.h>
#include <cuda.h>
#include <cuPrintf.cu>
#include "functions.h"
#define CUPRINTF cuPrintf
#define VALUESIZE 100
// Global Variables
int results = 0;
int verbose = 0;
int memory_usage = 0;
int create_data = 0;
struct prefixArray {
int value[VALUESIZE]; //An array of hash values
int size; //Hold the total size of the value array
int index[VALUESIZE]; //Store the pattern number
};
__global__ void wuKernel( unsigned char **d_pattern, unsigned char *d_text, int *d_matches, int m, int n, int blocksize, long shiftsize, int *d_SHIFT, struct prefixArray *d_PREFIX, int p_size ) {
int d_verbose = 1;
int idx = blockDim.x * blockIdx.x + threadIdx.x; //thread index
int start = idx * n / ( blockDim.x * blocksize );
int stop = start + n / ( blockDim.x * blocksize );
int startThread = threadIdx.x * ( m / blockDim.x );
int stopThread = startThread + ( m / blockDim.x );
__syncthreads();
int i, j, q, B = 3;
unsigned int hash;
//The original paper uses Hbits = 5 in the example code given
unsigned short m_nBitsInShift = 4;
size_t shiftlen, prefixhash;
//default to m - B + 1 for shift;
for ( i = 0; i < shiftsize; i++ )
d_SHIFT[i] = m - B + 1;
//Preprocessing phase
//for each pattern
for ( j = 0; j < p_size; ++j ) {
//add each 3-character subpattern (similar to q-grams)
for ( q = m; q >= B; --q ) {
hash = d_pattern[j][q - 2 - 1]; // bring in offsets of X in pattern j
hash <<= m_nBitsInShift;
hash += d_pattern[j][q - 1 - 1];
hash <<= m_nBitsInShift;
hash += d_pattern[j][q - 1];
CUPRINTF("hash = %i %i %i %i\n", hash, d_pattern[j][q - 2 - 1], d_pattern[j][q - 2], d_pattern[j][q - 1]);
shiftlen = m - q;
d_SHIFT[hash] = MIN2( d_SHIFT[hash], shiftlen );
//calculate the hash of the prefixes for each pattern
if ( shiftlen == 0 ) {
prefixhash = d_pattern[j][0];
prefixhash <<= m_nBitsInShift;
prefixhash += d_pattern[j][1];
//Error while trying to access d_PREFIX (uncomment one of the 3 lines)
// d_PREFIX[hash].value[d_PREFIX[hash].size] = prefixhash;
// d_PREFIX[hash].index[d_PREFIX[hash].size] = j;
// d_PREFIX[hash].size++;
}
}
}
}
int main( int argc, char** argv)
{
int m, n, alphabet, p_size;
unsigned int i;
char *text_filename = ( char * ) malloc ( sizeof( char ) * 100 );
char *pattern_filename = ( char * ) malloc ( sizeof( char ) * 100 );
// Define grid and block size
int numThreadsPerBlock, numBlocks;
int sharedMemSize = 16000;
// Scan command line arguments
m = atoi(argv[1]);
n = atoi(argv[2]);
p_size = atoi(argv[3]);
alphabet = atoi(argv[4]);
numBlocks = atoi(argv[5]);
numThreadsPerBlock = atoi(argv[6]);
// Data paths
text_filename = "data/text";
pattern_filename = "data/pattern";
// HOST malloc
unsigned char *text = ( unsigned char * ) malloc ( n * sizeof ( unsigned char ) );
unsigned char **pattern = ( unsigned char ** ) malloc ( m * sizeof( unsigned char * ));
if( pattern == NULL )
fail("Failed to allocate array!\n");
for( i = 0; i < m; i++ ) {
pattern[i] = ( unsigned char * ) malloc ( p_size * sizeof( unsigned char ));
if( pattern[i] == NULL )
fail("Failed to allocate array!\n");
}
int *cuda_matches = ( int * ) malloc ( sizeof( int ) * m );
int *results_array = ( int * ) malloc ( sizeof( int ) * ( ( n - m + 1) / m ) );
load_files( pattern, text, m, n, pattern_filename, text_filename, p_size );
// Device Pointers
unsigned char **d_pattern;
unsigned char *h_temp[p_size], *d_text;
int *d_matches;
// Create 2D Array
cudaMalloc( (void **)&d_pattern, m * sizeof( unsigned char * ) );
for( i = 0; i < p_size; i++ )
cudaMalloc( (void **)&h_temp[i], m * sizeof( unsigned char ) );
cudaMemcpy(d_pattern, h_temp, m * sizeof( unsigned char * ), cudaMemcpyHostToDevice);
for( i = 0; i < p_size; i++ )
cudaMemcpy(h_temp[i], pattern[i], m * sizeof( unsigned char ), cudaMemcpyHostToDevice);
checkCUDAError("Memcpy");
cudaMalloc( (void **) &d_text, n * sizeof ( unsigned char ) );
checkCUDAError("Malloc");
cudaMalloc( (void **) &d_matches, m * sizeof ( int ) );
checkCUDAError("Malloc");
cudaMemcpy( d_text, text, n * sizeof ( unsigned char ), cudaMemcpyHostToDevice );
cudaMemcpy( d_matches, cuda_matches, m * sizeof( int * ), cudaMemcpyHostToDevice );
printf(" * Copying host to device array...\n");
// Threads init
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
// WU CUDA Run
long shiftsize = 32768;
int *d_SHIFT;
cudaMalloc( (void **) &d_SHIFT, sizeof( int ) * shiftsize );
checkCUDAError("Malloc");
struct prefixArray *d_PREFIX;
cudaMalloc( (void **) &d_PREFIX, sizeof( struct prefixArray ) * shiftsize );
checkCUDAError("Malloc");
// Kernel Launch
cudaPrintfInit();
wuKernel<<< dimGrid, dimBlock, sharedMemSize >>>( d_pattern, d_text, d_matches, m, n, numBlocks, shiftsize, d_SHIFT, d_PREFIX, p_size );
checkCUDAError("Kernel Invocation");
// Block until the device has completed
cudaThreadSynchronize();
checkCUDAError("Kernel Synchronization");
// Copy Device mem to Host mem
for( int i = 0; i < p_size; i++)
cudaMemcpy( pattern[i], h_temp[i], m * sizeof( unsigned char ), cudaMemcpyDeviceToHost);
//Print GPU results
cudaPrintfDisplay(stdout, true);
// de-allocation
cudaPrintfEnd();
//freeing device memory
cudaThreadExit();
// freeing host memory
return 0;
}
I tried narrowing it down (it is 700 lines :p)
It’s an algorithm for multiple pattern matching. Its a port from the cpu version, to cuda.
First runs the cpu (not shown in the snippet) and then the gpu.
Assume that everything else load nicely.
The problem is in the kernel where i have the appropriate comment. Uncomenting said lines, results in the error mentioned above. Obviously there is a problem with the struct.
I also tried to create a struct in main and the cudamemcpy it to the struct on the gpu, but nothing changed.