Strange problem with kernel launch

Hi!

I’ve written the following code:

[codebox]typedef float2 Complex;

int main(int argc, char *argv[]){

...

int N_int = 2560;

// Load input from file
Complex *sig = load_input("sig.txt",&sig_length);
Complex *sig_d = load_input("sig_d.txt",&sig_length);

...

pthread_t thds[NUM_OF_BLKS]; // NUM_OF_BLKS == 16

for(i=0; i<NUM_OF_BLKS; ++i){
	tArgs[i].sig = sig;
	tArgs[i].sig_d = sig_d;
	tArgs[i].sig_filt = NULL;
	tArgs[i].finish = (i+1) * (sig_length/NUM_OF_BLKS);
	if(i==0)
		tArgs[i].start = 0;
	else
		tArgs[i].start = tArgs[i-1].finish;
}

sig_length -= N_int;

...

Complex *dsig;
if(cudaMalloc((void **) &dsig, sig_length*sizeof(Complex)) != cudaSuccess)
	PrintCUDAError("Error cudaMalloc() dsig in main():");
cudaMemcpy(dsig, sig+N_int, sig_length*sizeof(Complex), cudaMemcpyHostToDevice);
cudaBindTexture(0, texRefsig, dsig, sig_length*sizeof(Complex));

Complex *dsig_filt;
if(cudaMalloc((void **) &dsig_filt, sig_length*sizeof(Complex)) != cudaSuccess)
	PrintCUDAError("Error cudaMalloc() dsig_filt in main():");

...

TIMER_START;

// The following algorithm calculates some values in blocks of 16384 elements each.
// At the end, tArgs[i].sig_filt contains the 16384 elements calculated by the i-th thread.
pthread_create(&thds[0], NULL, GALfilter, (void *) &tArgs[0]);
pthread_join(thds[0], NULL);
memcpy(tmp, tArgs[0].sig_filt+N_int, (tArgs[0].finish-N_int)*sizeof(Complex));

for(i=1; i<NUM_OF_BLKS; ++i)
	pthread_create(&thds[i], NULL, GALfilter, (void *) &tArgs[i]);

for(i=1; i<NUM_OF_BLKS; ++i){
	pthread_join(thds[i], NULL);
	memcpy(tmp+(i*(tArgs[i].finish-tArgs[i].start)-N_int), tArgs[i].sig_filt, (tArgs[i].finish-tArgs[i].start)*sizeof(Complex));
}

cudaMemcpy(dsig_filt, tmp, sig_length*sizeof(Complex), cudaMemcpyHostToDevice);
cudaBindTexture(0, texRefsigfilt, dsig_filt, sig_length*sizeof(Complex));

dim3 dimGrid((Nblock-1)/block_size + ((Nblock-1)%block_size == 0 ? 0 : 1), 1, 1);
dim3 dimBlock(block_size, 1, 1);

ambfunc_kern<<<dimGrid,dimBlock>>>(dhNc, dhNblock, Nblock, block_size, nxtPow2Nblock, dsnap_shot, dtrsnap_shot);
cudaThreadSynchronize();

...

}[/codebox]

The problem arises when my program tries to launch ambfunc_kern(). It gives me this error:

cutilCheckMsg() CUTIL CUDA error: fftSmem1024D_kernel<FFT_FORWARD> execution failed

 in file </home/buildmeister/build/rel/gpgpu/toolkit/r3.1/cufft/src/accel/interface/sp1D.cu>, line 265 : unspecified launch failure.

Curiously, if I don’t execute the algorithm above (the one that uses the pthreads), the kernel works normally. I don’t know why. It doesn’t seem to be an array-out-of-bounds problem.
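In case it helps to narrow down where the failure really comes from, I could check the status of every CUDA call before the launch, something like this (just a sketch; checkErr() is a hypothetical helper, not in my real code):

[codebox]/* needs <stdio.h> and <stdlib.h> */
static void checkErr(cudaError_t err, const char *msg){
	if(err != cudaSuccess){
		fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

/* used around the copies, bindings and the launch above, e.g.: */
checkErr(cudaMemcpy(dsig_filt, tmp, sig_length*sizeof(Complex), cudaMemcpyHostToDevice), "memcpy dsig_filt");
checkErr(cudaBindTexture(0, texRefsigfilt, dsig_filt, sig_length*sizeof(Complex)), "bind texRefsigfilt");

ambfunc_kern<<<dimGrid,dimBlock>>>(dhNc, dhNblock, Nblock, block_size, nxtPow2Nblock, dsnap_shot, dtrsnap_shot);
checkErr(cudaGetLastError(), "ambfunc_kern launch");
checkErr(cudaThreadSynchronize(), "ambfunc_kern sync");[/codebox]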

memory leak, possibly?

But where’s the memory leak?

I’ve tried calling cudaSetDevice(0); but I have the same problem. I can’t see where a memory leak would be: I first “merge” the per-thread arrays into one array on the CPU, and if it were a memory leak, it would show up at that point with a segmentation fault. Instead, it continues.

I’m not a pthread connoisseur, but I know that the CUDA context is bound to a CPU thread. So if you could start all the pthreads (including the first) and call pthread_join on them from the non-pthread main() routine, the problem might go away.

I don’t know if this is possible with pthreads. If not, you could try calling pthread_exit() for the first pthread (after all the other threads have joined); if you are not thrown out of the program altogether, this could have the desired effect as well.

The FFT_FORWARD part is strange, but it just might point to a threading problem.
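I don’t know what GALfilter does internally, but this is roughly what I mean (just a sketch; threadArgs is only my guess at the name of your argument struct): all cuda* calls stay in main(), the workers only do CPU work, and main() creates all sixteen threads before joining any of them:

[codebox]void *GALfilter(void *arg){
	threadArgs *a = (threadArgs *) arg;   /* hypothetical name for your argument struct */

	/* pure CPU filtering of a->sig / a->sig_d from a->start to a->finish;
	   no cudaMalloc/cudaMemcpy/kernel launches in here, because the CUDA
	   context belongs to the CPU thread that created it (main). */

	pthread_exit(NULL);
	return NULL;   /* never reached */
}

/* in main(): create everything first, then join everything */
for(i=0; i<NUM_OF_BLKS; ++i)
	pthread_create(&thds[i], NULL, GALfilter, (void *) &tArgs[i]);
for(i=0; i<NUM_OF_BLKS; ++i)
	pthread_join(thds[i], NULL);

/* merge tArgs[i].sig_filt into tmp here, then touch the GPU again from main(): */
cudaMemcpy(dsig_filt, tmp, sig_length*sizeof(Complex), cudaMemcpyHostToDevice);[/codebox]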

I already call pthread_exit() in every thread, but it doesn’t seem to change anything. I don’t quite understand what you mean by “start all pthreads (including the first) and call pthread_join from the non-pthread main() routine”. Don’t I already do that?

This is what I thought of (but I don’t know if it’s correct):[codebox]TIMER_START;

// The following algorithm calculates some values in blocks of 16384 elements each.
// At the end, tArgs[i].sig_filt contains the 16384 elements calculated by the i-th thread.
for(i=0; i<NUM_OF_BLKS; ++i)
	pthread_create(&thds[i], NULL, GALfilter, (void *) &tArgs[i]);

for(i=0; i<NUM_OF_BLKS; ++i){
	pthread_join(thds[i], NULL);
	memcpy(tmp+(i*(tArgs[i].finish-tArgs[i].start)-N_int), tArgs[i].sig_filt, (tArgs[i].finish-tArgs[i].start)*sizeof(Complex));
}
[/codebox]
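The part I’m not sure about is the copy for i==0: in my original version the first block skips the first N_int samples of sig_filt, so inside the single loop the memcpy would probably have to become something like this (just a sketch):

[codebox]for(i=0; i<NUM_OF_BLKS; ++i){
	pthread_join(thds[i], NULL);
	if(i == 0)
		/* first block: skip the first N_int samples, as in my original version */
		memcpy(tmp, tArgs[0].sig_filt+N_int, (tArgs[0].finish-N_int)*sizeof(Complex));
	else
		memcpy(tmp+(i*(tArgs[i].finish-tArgs[i].start)-N_int), tArgs[i].sig_filt,
		       (tArgs[i].finish-tArgs[i].start)*sizeof(Complex));
}[/codebox]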
