Hi All!
I’m trying to implement parallel brute force search.
Main idea: all threads suspects that the word, we searching for, exists at position
offset=(blockIdx.x * blockDim.x) + threadIdx.x;
If the word really exists there - thread writes the offset if it less than previous found (to find first word occurance in the text).
But the following code doesn’t work:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
// includes, project
#include <cutil.h>
#include <cuda.h>
#define MEM_SIZE 66559999 // == (((260000-1) * 256) + 255)
#define NUM_BLOCKS 260000
#define NUM_THREADS 256
__device__ int PRESENT(const char *word, const char *text_at_offset, int word_len)
{
while(word_len)
{
if(word[word_len] != text_at_offset[word_len]) return 0; // not present
word_len--;
}
return 1; // present
}
__global__ void BFWrap(const char *d_word, const int *d_word_len, const char *d_text, int *d_word_index)
{
const int offset = (blockIdx.x * blockDim.x) + threadIdx.x; // 0 to 66559999
if(offset > (MEM_SIZE - (*d_word_len))) return; // word can't fit in remainder
int present = PRESENT(d_word, (d_text + offset), (*d_word_len));
if(present)
{
if(offset < (*d_word_index)) *d_word_index = offset;
}
}
int search_word(const char *h_word, const char *d_text)
{
int h_word_index = MEM_SIZE;
int h_word_len = strlen(h_word);
char *d_word = NULL;
int *d_word_len = NULL;
int *d_word_index = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_word, h_word_len));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_word_len, sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_word_index, sizeof(int)));
CUDA_SAFE_CALL(cudaMemcpy(d_word, h_word, h_word_len, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_word_len, &h_word_len, sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_word_index, &h_word_index, sizeof(int), cudaMemcpyHostToDevice));
// run parallel search!
BFWrap<<<NUM_BLOCKS, NUM_THREADS>>>(d_word, d_word_len, d_text, d_word_index);
CUDA_SAFE_CALL(cudaThreadSynchronize());
// retrieve word index
CUDA_SAFE_CALL(cudaMemcpy(&h_word_index, d_word_index, sizeof(int), cudaMemcpyDeviceToHost));
// free memory
CUDA_SAFE_CALL(cudaFree(d_word));
CUDA_SAFE_CALL(cudaFree(d_word_len));
CUDA_SAFE_CALL(cudaFree(d_word_index));
return h_word_index;
}
int main(int argc, char** argv)
{
CUT_DEVICE_INIT(argc, argv);
FILE *F;
// load text into device
char *h_text = NULL;
char *d_text = NULL;
CUDA_SAFE_CALL( cudaMallocHost( (void**)&h_text, MEM_SIZE ) ); // pinned memory
CUDA_SAFE_CALL(cudaMalloc((void**)&d_text, MEM_SIZE));
CUDA_SAFE_CALL(cudaMemset(d_text, 0, MEM_SIZE));
F = fopen("in.txt", "rb");
int textSize = fread(h_text, 1, MEM_SIZE, F);
fclose(F);
CUDA_SAFE_CALL(cudaMemcpy(d_text, h_text, textSize, cudaMemcpyHostToDevice));
// run word(s) search
unsigned int timer = 0;
float elapsedTimeInMs = 0.0f;
CUT_SAFE_CALL( cutCreateTimer( &timer ) );
CUT_SAFE_CALL( cutStartTimer( timer));
char h_word[] = "zzz";
int h_word_index = search_word(h_word, d_text);
CUT_SAFE_CALL( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
// report results
printf("\n Word '%s' found at %i", h_word, h_word_index);
printf("\n Elapsed time: %f seconds\n", elapsedTimeInMs / (float)1000);
// free memory
CUDA_SAFE_CALL(cudaFreeHost(h_text));
CUDA_SAFE_CALL(cudaFree(d_text));
CUT_EXIT(argc, argv);
}
Any ideas what I’m doing wrong?
Could you please advise how to implement fast search using CUDA?