Hello,
I’m new to the world of CUDA and I want to try it. I have watched a few tutorials on programming that exploits parallelism.
The first program I tried to write in CUDA is a port of one I had already written in PHP.
I have a very long string of characters (nearly 100k chars) and I need to find out whether it contains any of a set of other strings, all of the same length.
To do this I tried to use CUDA and take advantage of multithreading, with one thread for each sequence of characters that I need to find.
The base code is the following:
// Number of search patterns; the kernel is launched with one thread per pattern.
#define N 1000
// Length, in characters, of every search pattern.
#define DIM 50
// Searches a text for N fixed-length patterns, one pattern per thread.
//
// Parameters:
//   a  device pointer to the text to scan (d characters, no terminator needed)
//   b  device pointer to N patterns of DIM characters each, stored back to
//      back: pattern t occupies b[t*DIM] .. b[t*DIM + DIM - 1]
//   c  device pointer to N ints; c[t] receives the start offset of the LAST
//      occurrence of pattern t in the text, or -1 when it does not occur
//   d  number of characters in a
//
// Launch: 1-D grid and block. Any configuration that supplies at least N
// threads works (e.g. <<<1, N>>>); surplus threads exit without touching memory.
__global__ void helloKern( char* a, char *b, int* c, int d){
    // Global index, so the kernel is also correct when launched with several blocks.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;  // b and c only hold N entries
    }

    const char *pattern = b + tid * DIM;  // this thread's search string
    int found = -1;                       // kept in a register, stored once at the end

    // Only offsets where a whole pattern still fits are candidates; going
    // further would read a[k+i] past the end of the text.
    for (int k = 0; k + DIM <= d; k++) {
        int i = 0;
        while (i < DIM && a[k + i] == pattern[i]) {
            i++;
        }
        if (i == DIM) {
            found = k;  // keep scanning: the last occurrence wins
        }
    }

    c[tid] = found;
}
// Host driver: uploads the text and the N search patterns, runs helloKern with
// one thread per pattern, and downloads the per-pattern match offsets.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch failed.
int main( void ){
    const char *frase = "lorem i...... ecc 70kchar";
    const char *ricerca = "find1find2find3...find50"; //every find have 50char
    int trovati[N];
    char *dev_frase = NULL;
    char *dev_ricerca = NULL;
    int *dev_trovati = NULL;

    const size_t frase_len = strlen(frase);  // measured once, reused below
    const size_t ricerca_bytes = N * DIM * sizeof(char);
    // Never read past the end of the host pattern string: copy only what it
    // really contains; the rest of the device buffer stays zero-filled.
    size_t ricerca_len = strlen(ricerca);
    if (ricerca_len > ricerca_bytes) {
        ricerca_len = ricerca_bytes;
    }

    // Cuda malloc and copy. Each step runs only if every previous one succeeded,
    // so `status` always holds the first error encountered.
    cudaError_t status = cudaMalloc((void **)&dev_ricerca, ricerca_bytes);
    if (status == cudaSuccess) {
        status = cudaMalloc((void **)&dev_frase, frase_len * sizeof(char));
    }
    if (status == cudaSuccess) {
        status = cudaMalloc((void **)&dev_trovati, N * sizeof(int));
    }
    if (status == cudaSuccess) {
        status = cudaMemset(dev_ricerca, 0, ricerca_bytes);
    }
    if (status == cudaSuccess) {
        status = cudaMemcpy(dev_ricerca, ricerca, ricerca_len * sizeof(char), cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess) {
        status = cudaMemcpy(dev_frase, frase, frase_len * sizeof(char), cudaMemcpyHostToDevice);
    }

    // Do 1000 thread, one for every word
    if (status == cudaSuccess) {
        helloKern <<< 1, N >>> (dev_frase, dev_ricerca, dev_trovati, (int)frase_len);
        // A launch does not return an error itself; ask the runtime explicitly.
        status = cudaGetLastError();
    }
    // This blocking copy also waits for the kernel and reports any fault it hit.
    if (status == cudaSuccess) {
        status = cudaMemcpy(trovati, dev_trovati, N * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess) {
        // Then Print out
    }

    // Release device memory on every path; pointers still NULL are skipped by cudaFree.
    cudaFree(dev_trovati);
    cudaFree(dev_frase);
    cudaFree(dev_ricerca);

    return status == cudaSuccess ? 0 : 1;
}
I tried compiling it and it works fine, but I have seen that the memory copies are very fast (microseconds) while the algorithm itself is quite slow: on a text of 70k characters, searching in parallel for 1000 words of 50 characters each, it takes about 20 seconds.
I was expecting it to take less time, especially considering that the same search done with a PHP regex, applied 1000 times, is faster (on a normal processor).