Hi!
I’ve wrote the following code:
typedef float2 Complex;
int main(int argc, char *argv[]){
...
int N_int = 2560;
// Load input from file
Complex *sig = load_input("sig.txt",&sig_length);
Complex *sig_d = load_input("sig_d.txt",&sig_length);
...
pthread_t thds[NUM_OF_BLKS]; // NUM_OF_BLKS == 16
for(i=0; i<NUM_OF_BLKS; ++i){
tArgs[i].sig = sig;
tArgs[i].sig_d = sig_d;
tArgs[i].sig_filt = NULL;
tArgs[i].finish = (i+1) * (sig_length/NUM_OF_BLKS);
if(i==0)
tArgs[i].start = 0;
else
tArgs[i].start = tArgs[i-1].finish;
}
sig_length-=N_int;
...
Complex *dsig;
if(cudaMalloc((void **) &dsig, sig_length*sizeof(Complex))!=cudaSuccess)
PrintCUDAError("Error cudaMalloc() dsig in main():");
cudaMemcpy(dsig,sig+N_int,sig_length*sizeof(Complex),cudaMemcpyHostToDevice);
cudaBindTexture(0, texRefsig, dsig, sig_length*sizeof(Complex));
Complex *dsig_filt;
if(cudaMalloc((void **) &dsig_filt, sig_length*sizeof(Complex))!=cudaSuccess)
PrintCUDAError("Error cudaMalloc() dsig_filt in main():");
...
TIMER_START;
// The following algorithm calculate some values in blocks of 16384 elements each. At the end, tArgs[i]->sig_filt contains the 16384 elements calculated by i-th thread.
pthread_create(&thds[0],NULL,GALfilter,(void *) &tArgs[0]);
pthread_join(thds[0],NULL);
memcpy(tmp,tArgs[0].sig_filt+N_int,(tArgs[0].finish-N_int)*sizeof(Complex));
for(i=1; i<NUM_OF_BLKS; ++i)
pthread_create(&thds[i],NULL,GALfilter,(void *) &tArgs[i]);
for(i=1; i<NUM_OF_BLKS; ++i){
pthread_join(thds[i],NULL);
memcpy(tmp+(i*(tArgs[i].finish-tArgs[i].start)-N_int),tArgs[i].sig_filt,(tArgs[i].finish-tArgs[i].start)*sizeof(Complex));
}
cudaMemcpy(dsig_filt,tmp,sig_length*sizeof(Complex),cudaMemcpyHostToDevice);
cudaBindTexture(0, texRefsigfilt, dsig_filt, sig_length*sizeof(Complex));
dim3 dimGrid((Nblock-1)/block_size+((Nblock-1)%block_size == 0 ? 0 : 1),1,1);
dim3 dimBlock(block_size,1,1);
ambfunc_kern<<<dimGrid,dimBlock>>>(dhNc,dhNblock,Nblock,block_size,nxtPow2Nblock,dsnap_shot
,dtrsnap_shot);
cudaThreadSynchronize();
....
}
The problem arise when my program wants to start ambfunc_kern(). It returns me this error:
cutilCheckMsg() CUTIL CUDA error: fftSmem1024D_kernel<FFT_FORWARD> execution failed
in file </home/buildmeister/build/rel/gpgpu/toolkit/r3.1/cufft/src/accel/interface/sp1D.cu>, line 265 : unspecified launch failure.
Curiously, if I don’t execute the algorithm above with the pthreads, the kernel works normally. I don’t know why. It doesn’t seem an array out of bounds problem.