wrong number of threads when writing in global memory

Hi,

I’m writing a parallel sequence alignment application. I have to compute a matrix antidiagonal by antidiagonal. I start the computing from the high left corner. So the first antidiagonal has a size of 1 cell, the second 2 cells and so on until the maximal diagonal size is reached.

Here’s my kernel code :

__global__ void AlignmentScoreKernel(Matrix score, charArray s, charArray t, int max_diag_size, int nbr_diag, int* data_diag) {

	

	//IDs du thread courant

	int blockID = blockIdx.x;

	int threadID = threadIdx.x;

	int index = blockID*BLOCK_SIZE + threadID;

	

	printf("hehe I am %d %d %d\n", blockID, threadID, index);

	//taille de la diagonale

	int diag_size = 1;

	

	//valeur de la diagonale

	int value_diag = 0;

	

	//calcul de la matrice

	for(int d=0; d<nbr_diag; d++) {

		if(index <= diag_size) {

			//calcul des indices de la case inferieure droite de l'antidiagonale

			int di, dj;

			if(d < s.length) {

				di = d + 1;

				dj = 1;

			}

			else {

				di = s.length;

				dj = d - s.length + 2;

			}

			

			//calcul de l'antidiagonale (un element par thread)

			value_diag = sim_global_on_device(data_diag[index*2],

											  data_diag[index*2+1],

											  data_diag[index*2+2],

											  s.values[di-index-1],

											  t.values[dj+index-1]);

											  

											  

			// c'est cette ligne qui pose probleme 

			SetElement(score, di-index, dj+index, value_diag);

			

			// synchroniser les threads pour etre sur que toutes les valeur soient ecrites avant de mettre a jour data_diag

			__syncthreads();

			

			//mise a jour de data_diag

			//si on n'a pas encore atteint le fond de la matrice (on descend)

			if(d < s.length-1) {

				if(index == 0) {

					data_diag[0] = GetElement(score, di+1, dj-1);

					data_diag[1] = GetElement(score, di, dj-1);

				}

				

				if(index == diag_size) {

					data_diag[index*2+2] = GetElement(score, di-index-1, dj+index+1);

				}

			}

			//si on a atteint le fond de la matrice (on decale vers la droite)

			else {

				if(index == diag_size-1) {

					data_diag[index*2+2] = GetElement(score, di-index-1, dj+index+1);

				}

			}

			data_diag[index*2] = GetElement(score, di-index, dj+index);

			data_diag[index*2+1] = GetElement(score, di-index-1, dj+index);

			

			//mise a jour de la taille de la diagonale

			//on augmente la taille tant qu'on n'a pas atteint le maximum

			if(diag_size < max_diag_size) {

				if( ((d < s.length) && (s.length <= t.length)) || ((d < t.length-1) && (s.length > t.length)) ) {

					diag_size++;

				}

			}

			//on diminue la taille quand on entre dans le coin inferieur droit de la matrice

			if( ((s.length >= t.length) && (d >= s.length-1)) || ((s.length < t.length) && (d >= t.length-1)) ) {

				diag_size--;

			}

			__syncthreads();

		}

	}

}

The line SetElement(score, di-index, dj+index, value_diag); brings the problem. I compiled the code in emulation mode to see exactly the number of executed threads with the printf line at the begining of the kernel. Only 3 threads are executed instead of min(s.length, t.length).

Do you know what’s going on?

Thanks in advance!

It seems like there’s a deadlock somewhere, probably because of both “__syncthreads()” but I don’t know how to solve that problem, they cannot be removed. Any ideas?

The _syncthreads() call inside a conditional statement isn’t a good idea. As soon as there is warp divergence, you have deadlock. If you move it to the bottom of the loop, it should be safe.

Ok I moved the __syncthreads() to the bottom of the loop. Now the program never ends.

Now here’s the code:

__global__ void AlignmentScoreKernel(Matrix score, charArray s, charArray t, int max_diag_size, int nbr_diag, int* data_diag) {

	

	//IDs du thread courant

	int blockID = blockIdx.x;

	int threadID = threadIdx.x;

	int index = blockID*BLOCK_SIZE + threadID;

	

	printf("hehe I am %d %d %d\n", blockID, threadID, index);

	//taille de la diagonale

	int diag_size = 1;

	

	//valeur de la diagonale

	int value_diag = 0;

	

	//calcul de la matrice

	for(int d=0; d<nbr_diag; d++) {

		if(index < diag_size) {

			//calcul des indices de la case inferieure droite de l'antidiagonale

			int di, dj;

			if(d < s.length) {

				di = d + 1;

				dj = 1;

			}

			else {

				di = s.length;

				dj = d - s.length + 2;

			}

			

			//calcul de l'antidiagonale (un element par thread)

			value_diag = sim_global_on_device(data_diag[index*2],

											  data_diag[index*2+1],

											  data_diag[index*2+2],

											  s.values[di-index-1],

											  t.values[dj+index-1]);

											  

											  

			// c'est cette ligne qui pose probleme 

			SetElement(score, di-index, dj+index, value_diag);

			

			//mise a jour de data_diag

			//si on n'a pas encore atteint le fond de la matrice (on descend)

			if(d < s.length-1) {

				if(index == 0) {

					data_diag[0] = GetElement(score, di+1, dj-1);

					data_diag[1] = GetElement(score, di, dj-1);

				}

				

				if(index == diag_size) {

					data_diag[index*2+2] = GetElement(score, di-index-1, dj+index+1);

				}

			}

			//si on a atteint le fond de la matrice (on decale vers la droite)

			else {

				if(index == diag_size-1) {

					data_diag[index*2+2] = GetElement(score, di-index-1, dj+index+1);

				}

			}

			data_diag[index*2] = GetElement(score, di-index, dj+index);

			data_diag[index*2+1] = GetElement(score, di-index-1, dj+index);

			

			//mise a jour de la taille de la diagonale

			//on augmente la taille tant qu'on n'a pas atteint le maximum

			if(diag_size < max_diag_size) {

				if( ((d < s.length) && (s.length <= t.length)) || ((d < t.length-1) && (s.length > t.length)) ) {

					diag_size++;

				}

			}

			//on diminue la taille quand on entre dans le coin inferieur droit de la matrice

			if( ((s.length >= t.length) && (d >= s.length-1)) || ((s.length < t.length) && (d >= t.length-1)) ) {

				diag_size--;

			}

		}

		__syncthreads();

	}

}

At the line

SetElement(score, di-index, dj+index, value_diag);

I’m writing an element in a matrix cell. Every threads have to execute this line before they could continue with several “GetElement”. How can I manage to do this? I tried with __threadfence() but there’s the same problem with __syncthreads().

What is wrong with just breaking the conditional in “half” with a _syncthreads() call in the middle?

With a __syncthreads() after the “SetElement”, the first thread just block on the __syncthread and never continues.

Otherwise, there’s only 2 threads running instead of 16 (I set the block size at 8 and there’s 2 block to launch in this case but it is only testing data).

I mean something like this:

__global__ void AlignmentScoreKernel(Matrix score, charArray s, charArray t, int max_diag_size, int nbr_diag, int* data_diag) {

	

	//IDs du thread courant

	int blockID = blockIdx.x;

	int threadID = threadIdx.x;

	int index = blockID*BLOCK_SIZE + threadID;

	

	printf("hehe I am %d %d %d\n", blockID, threadID, index);

	//taille de la diagonale

	int diag_size = 1;

	

	//valeur de la diagonale

	int value_diag = 0;

	

	//calcul de la matrice

	for(int d=0; d<nbr_diag; d++) {

		if(index < diag_size) {

			//calcul des indices de la case inferieure droite de l'antidiagonale

			int di, dj;

			if(d < s.length) {

				di = d + 1;

				dj = 1;

			}

			else {

				di = s.length;

				dj = d - s.length + 2;

			}

			

			//calcul de l'antidiagonale (un element par thread)

			value_diag = sim_global_on_device(data_diag[index*2],

											  data_diag[index*2+1],

											  data_diag[index*2+2],

											  s.values[di-index-1],

											  t.values[dj+index-1]);

											  

											  

			// c'est cette ligne qui pose probleme

			SetElement(score, di-index, dj+index, value_diag);

		}

		__syncthreads();

		if(index < diag_size) {

			//mise a jour de data_diag

			//si on n'a pas encore atteint le fond de la matrice (on descend)

			if(d < s.length-1) {

				if(index == 0) {

					data_diag[0] = GetElement(score, di+1, dj-1);

					data_diag[1] = GetElement(score, di, dj-1);

				}

				

				if(index == diag_size) {

					data_diag[index*2+2] = GetElement(score, di-index-1, dj+index+1);

				}

			}

			//si on a atteint le fond de la matrice (on decale vers la droite)

			else {

				if(index == diag_size-1) {

					data_diag[index*2+2] = GetElement(score, di-index-1, dj+index+1);

				}

			}

			data_diag[index*2] = GetElement(score, di-index, dj+index);

			data_diag[index*2+1] = GetElement(score, di-index-1, dj+index);

			

			//mise a jour de la taille de la diagonale

			//on augmente la taille tant qu'on n'a pas atteint le maximum

			if(diag_size < max_diag_size) {

				if( ((d < s.length) && (s.length <= t.length)) || ((d < t.length-1) && (s.length > t.length)) ) {

					diag_size++;

				}

			}

			//on diminue la taille quand on entre dans le coin inferieur droit de la matrice

			if( ((s.length >= t.length) && (d >= s.length-1)) || ((s.length < t.length) && (d >= t.length-1)) ) {

				diag_size--;

			}

		}

		__syncthreads();

	}

}

Unless there is something odd going in inside the functions you haven’t shown, I don’t see how this can deadlock.

OK I misunderstood what you meant by “breaking the conditional” ^^

Now, more threads are running but it’s still blocking somewhere…

I probably missed something :(