Odd results then good results ?

Hi!

I wrote a parallel sequence alignment application. It computes an alignment matrix.
To be sure of the results I use a classic sequential algorithm on CPU to compute the same matrix. Then I compare the 2 matrices.
Something strange happens:
I compile the code then I run it.
First run, there’s a huge difference between the 2 matrices.
Second run, there’s still a difference, smaller.
Every following run gives exactly the same (good) result as the classic CPU code.

Why is this happening? And how to resolve this problem?

Thanks in advance!

For example, here’s the output of my program executed on small sequences:

0	-2	-4	-6	-8	-10	-12	-14	-16	-18	-20	-22	-24	-26	-28	-30	-32	-34	-36	-38	-40	-42	-44	-46	-48	

-2	-1	-3	-5	-5	-7	-9	-11	-13	-15	-17	-19	-21	-23	-25	-27	-29	-31	-33	-35	-37	-39	-41	-43	-45	

-4	-3	-2	-4	-4	-6	-8	-10	-12	-12	-14	-16	-18	-20	-22	-24	-26	-28	-30	-32	-34	-36	-38	-40	-42	

-6	-5	-4	-3	-3	-5	-7	-9	-11	-11	-13	-15	-17	-17	-19	-21	-23	-25	-27	-29	-31	-33	-35	-37	-39	

-8	-7	-6	-5	-4	-4	-6	-8	-10	-12	-12	-14	-14	-16	-18	-18	-20	-22	-24	-26	-28	-30	-32	-34	-36	

-10	-9	-8	-7	-6	-5	-5	-7	-7	-9	-11	-13	-15	-15	-17	-19	-19	-19	-21	-23	-25	-27	-29	-31	-33	

-12	-11	-10	-9	-8	-7	-6	-6	-8	-8	-10	-12	-12	-14	-16	-16	-18	-20	-20	-22	-24	-24	-26	-28	-30	

-14	-11	-10	-9	-10	-7	-6	-5	-7	-9	-7	-9	-11	-13	-13	-15	-17	-19	-19	-21	-23	-25	-25	-27	-27	

-16	-13	-12	-11	-10	-9	-8	-7	-4	-6	-8	-8	-10	-12	-14	-14	-16	-16	-18	-20	-20	-22	-24	-26	-28	

-18	-15	-14	-13	-10	-11	-10	-9	-6	-3	-5	-7	-9	-9	-11	-13	-15	-17	-17	-17	-19	-21	-23	-23	-25	

-20	-17	-16	-15	-12	-11	-12	-11	-8	-5	-4	-6	-6	-8	-10	-10	-12	-14	-16	-18	-18	-18	-20	-22	-24	

-22	-19	-18	-17	-14	-13	-12	-13	-10	-7	-6	-5	-7	-5	-7	-9	-11	-13	-15	-15	-17	-19	-19	-19	-21	

-24	-21	-20	-19	-16	-15	-14	-13	-12	-9	-8	-7	-4	-6	-6	-6	-8	-10	-12	-14	-16	-16	-18	-20	-20	

-26	-23	-20	-19	-18	-15	-14	-13	-14	-11	-8	-7	-6	-5	-5	-7	-7	-9	-9	-11	-13	-15	-17	-19	-19	

-28	-25	-22	-21	-20	-17	-16	-15	-12	-13	-10	-9	-8	-7	-6	-6	-8	-6	-8	-10	-10	-12	-14	-16	-18	

-30	-27	-24	-23	-20	-19	-18	-17	-14	-11	-12	-11	-10	-7	-8	-7	-7	-8	-7	-7	-9	-11	-13	-13	-15	

-32	-29	-26	-23	-22	-19	-18	-17	-16	-13	-10	-11	-12	-9	-6	-8	-8	-8	-7	-8	-8	-10	-12	-14	-12	

-34	-31	-28	-25	-24	-21	-20	-19	-16	-15	-12	-11	-12	-11	-8	-7	-9	-7	-9	-8	-7	-9	-9	-11	-13	

-36	-33	-30	-27	-26	-23	-22	-21	-18	-17	-14	-13	-10	-12	-10	-7	-6	-8	-8	-10	-9	-6	-8	-10	-12	

-38	-35	-32	-29	-28	-25	-22	-21	-20	-19	-16	-13	-12	-11	-11	-9	-8	-7	-7	-9	-11	-8	-7	-9	-9	

0	-2	-4	-6	-8	-10	-12	-14	-16	-18	-20	-22	-24	-26	-28	-30	-32	-34	-36	-38	-40	-42	-44	-46	-48	

-2	-1	-3	-5	-5	-7	-9	-11	-13	-15	-17	-19	-21	-23	-25	-27	-29	-33	-35	-35	-37	-39	-41	-43	-45	

-4	-3	-2	-4	-4	-6	-8	-10	-12	-12	-14	-16	-18	-20	-22	-24	-26	-5	-7	-9	-11	-13	-15	-17	-19	

-6	-5	-4	-3	-3	-5	-7	-9	-11	-11	-13	-15	-17	-17	-19	-21	-23	-4	-6	-6	-8	-10	-12	-14	-16	

-8	-7	-6	-5	-4	-4	-6	-8	-10	-12	-12	-14	-14	-16	-18	-18	-20	-6	-5	-7	-7	-7	-9	-11	-13	

-10	-9	-8	-7	-6	-5	-5	-7	-7	-9	-11	-13	-15	-15	-17	-19	-19	-8	-7	-6	-6	-8	-6	-8	-10	

-12	-11	-10	-9	-8	-7	-6	-6	-8	-8	-10	-12	-12	-14	-16	-16	-18	-10	-9	-8	-7	-5	-7	-7	-9	

-14	-11	-10	-9	-10	-7	-6	-5	-7	-9	-7	-9	-11	-13	-13	-15	-17	-12	-9	-10	-9	-7	-6	-8	-6	

-16	-13	-12	-11	-10	-9	-8	-7	-4	-6	-8	-8	-10	-12	-14	-14	-16	-14	-11	-10	-9	-9	-6	-7	-8	

-18	-15	-14	-13	-10	-11	-10	-9	-6	-3	-5	-7	-9	-9	-11	-13	-15	-16	-13	-10	-11	-10	-8	-5	-7	

-20	-17	-16	-15	-12	-11	-12	-11	-8	-5	-4	-6	-6	-8	-10	-10	-12	-14	-15	-12	-11	-10	-10	-7	-6	

-22	-19	-18	-17	-14	-13	-12	-13	-10	-7	-6	-5	-7	-5	-7	-9	-11	-13	-15	-14	-13	-12	-11	-9	-8	

-24	-21	-20	-19	-16	-15	-14	-13	-12	-9	-8	-7	-4	-6	-6	-6	-8	-10	-12	-14	-15	-12	-13	-11	-10	

-26	-23	-20	-19	-18	-15	-14	-13	-14	-11	-8	-7	-6	-5	-5	-7	-7	-9	-9	-11	-13	-14	-13	-13	-10	

-28	-25	-22	-21	-20	-17	-16	-15	-12	-13	-10	-9	-8	-7	-6	-6	-8	-6	-8	-10	-10	-12	-13	-14	-12	

-30	-27	-24	-23	-20	-19	-18	-17	-14	-11	-12	-11	-10	-7	-8	-7	-7	-8	-7	-7	-9	-11	-13	-12	-14	

-32	-29	-26	-23	-22	-19	-18	-17	-16	-13	-10	-11	-12	-9	-6	-8	-8	-8	-7	-8	-8	-10	-12	-14	-11	

-34	-31	-28	-25	-24	-21	-20	-19	-16	-15	-12	-11	-12	-11	-8	-7	-9	-7	-9	-8	-7	-9	-9	-11	-13	

-36	-33	-30	-27	-26	-23	-22	-21	-18	-17	-14	-13	-10	-12	-10	-7	-6	-8	-8	-10	-9	-6	-8	-10	-12	

-38	-35	-32	-29	-28	-25	-22	-21	-20	-19	-16	-13	-12	-11	-11	-9	-8	-7	-7	-9	-11	-8	-7	-9	-9	

diference between matrices : 1286

The first matrix is the good reference computed on CPU, the second is computed on GPU.

Now I run exactly the same code again :

...

2 times the same matrix

...

diference between matrices : 0

I don’t know what’s going on :(

When I execute the code on larger sequences (so the matrix is 10000x10000), the difference between the matrices is never zero.

I probably missed something in the code but why is this working sometimes with small sequences??

Here’s my code:

// Matrices are stored in row-major order:

// M(row, col) = *(M.elements + row * M.width + col)
// (there is no separate stride field; the row pitch is the width)

typedef struct {

	int width;   // number of columns (also the row pitch of `elements`)

	int height;  // number of rows

	int* elements;  // device pointer to width*height ints, row-major

} Matrix;

// acceder a l'element A[row][col]

// Read element A[row][col] of a row-major matrix.
__device__ int GetElement(Matrix A, int row, int col) {
	int offset = row * A.width + col;
	return A.elements[offset];
}

// definir l'element A[row][col] avec value

// Write `value` into element A[row][col] of a row-major matrix.
__device__ void SetElement(Matrix A, int row, int col, int value) {
	int offset = row * A.width + col;
	A.elements[offset] = value;
}

//penalite si gap

// Substitution score for a pair of sequence characters:
// +1 on a match, -1 on a mismatch.
__device__ int p_on_device(char a, char b) {
	return (a == b) ? 1 : -1;
}

//calcul de similarite global et local

// Global-alignment cell recurrence: the best of
//   - a gap coming from the left neighbour (a) or the upper neighbour (c),
//     both penalized by G (file-scope gap penalty), and
//   - a (mis)match coming from the diagonal neighbour (b).
// For ints, max(a - G, c - G) == max(a, c) - G, so the gap cases are folded.
__device__ int sim_global_on_device(int a, int b, int c, char s, char t) {
	int fromGap  = max(a, c) - G;
	int fromDiag = b + p_on_device(s, t);
	return max(fromGap, fromDiag);
}

// Fills the global-alignment score matrix one anti-diagonal at a time:
// each cell (di, dj) depends on its left, upper and upper-left neighbours,
// which all lie on the two previously computed diagonals, so every cell of
// one diagonal can be computed in parallel (one cell per thread).
//
// BUG (cross-block race): __syncthreads() at the bottom of the loop only
// synchronizes the threads of ONE block. When the diagonal spans several
// blocks, threads of one block may read `score` cells that another block
// has not yet written, producing nondeterministic results. A grid-wide
// barrier is required between diagonals — e.g. one kernel launch per
// diagonal (see the corrected kernel later in this thread).
__global__ void AlignmentScoreKernel(Matrix score, charArray s, charArray t, int max_diag_size, int nbr_diag, int* data_diag) {

	// IDs of the current thread; `index` is its position along the diagonal
	int blockID = blockIdx.x;

	int threadID = threadIdx.x;

	int index = blockID*BLOCK_SIZE + threadID;

	// current diagonal length (number of cells to compute on it)
	int diag_size = 1;

	// value computed by this thread for its cell of the diagonal
	int value_diag = 0;

	// matrix computation: one anti-diagonal per loop iteration
	for(int d=0; d<nbr_diag; d++) {

		//printf("hehe I am %d %d %d\n", blockID, threadID, index);

		// indices of the bottom-left cell of anti-diagonal d
		int di, dj;

		if(d < s.length) {

			di = d + 1;

			dj = 1;

		}

		else {

			di = s.length;

			dj = d - s.length + 2;

		}

		// thread `index` walks up the diagonal: its cell is (di-index, dj+index);
		// di can go negative here, but only for threads with index >= diag_size,
		// which are masked out by the guard below
		di -= index;

		dj += index;

		if(index < diag_size) {

			//printf("valeurs pour le calcul de value_diag %d %d %d\n", GetElement(score, di, dj-1), GetElement(score, di-1, dj-1), GetElement(score, di-1, dj));

			// compute one cell of the anti-diagonal (one element per thread)
			value_diag = sim_global_on_device(GetElement(score, di, dj-1),

											  GetElement(score, di-1, dj-1),

											  GetElement(score, di-1, dj),

											  s.values[di-1],

											  t.values[dj-1]);

			SetElement(score, di, dj, value_diag);

		}

		// update the diagonal size:
		// grow while the maximum has not yet been reached
		if(diag_size < max_diag_size) {

			if( ((d < s.length) && (s.length <= t.length)) || ((d < t.length-1) && (s.length > t.length)) ) {

				diag_size++;

			}

		}

		// shrink once we enter the lower-right corner of the matrix
		if( ((s.length >= t.length) && (d >= s.length-1)) || ((s.length < t.length) && (d >= t.length-1)) ) {

			diag_size--;

		}

		// NOTE(review): block-local barrier only — does NOT order writes/reads
		// across blocks; this is the source of the wrong first-run results
		__syncthreads();

	}

}

EDIT: should have read the actual code.

What prevents [font=“Courier New”]di[/font] from becoming negative?

di probably becomes negative but only when index > diag_size, so it’s OK I think…

Do you use more than one block?
I think (but haven’t really checked) your kernel were correct only if [font=“Courier New”]__syncthreads()[/font] would synchronize all threads. However, it only synchronizes the threads within each block. Thus, sometimes [font=“Courier New”]GetElement()[/font] reads elements of [font=“Courier New”]score[/font] not yet computed. Repeated calls of the kernel then successively fix this (provided you do not clear [font=“Courier New”]score[/font] in between).

Of course you do, as you have a matrix size of 10000x10000. This also explains why the problem becomes worse with larger matrices.

OK I see the problem now :)

So now the new kernel (no loop anymore) :

// Computes ONE anti-diagonal of the global-alignment score matrix
// (one cell per thread). Launched once per diagonal by the host loop;
// launch ordering on the stream provides the grid-wide synchronization
// between diagonals that __syncthreads() cannot (it is block-local only).
//
// Parameters:
//   score     - row-major score matrix (device memory), previous diagonals
//               already filled by earlier launches
//   s, t      - the two sequences being aligned
//   diag_size - number of cells on anti-diagonal d
//   d         - index of the anti-diagonal to compute
__global__ void AlignmentScoreKernel(Matrix score, charArray s, charArray t, int diag_size, int d) {

	// flat global thread index = position of this thread's cell on the diagonal
	int index = blockIdx.x*BLOCK_SIZE + threadIdx.x;

	// guard: the grid may carry more threads than cells on this diagonal
	if(index < diag_size) {

		// indices of the bottom-left cell of anti-diagonal d
		int di, dj;
		if(d < s.length) {
			di = d + 1;
			dj = 1;
		}
		else {
			di = s.length;
			dj = d - s.length + 2;
		}

		// thread `index` handles cell (di-index, dj+index) along the diagonal
		di -= index;
		dj += index;

		// one cell per thread; all three dependencies (left, diagonal, up)
		// belong to diagonals completed by earlier kernel launches
		int value_diag = sim_global_on_device(GetElement(score, di, dj-1),
										  GetElement(score, di-1, dj-1),
										  GetElement(score, di-1, dj),
										  s.values[di-1],
										  t.values[dj-1]);

		SetElement(score, di, dj, value_diag);
	}

	// NOTE: the trailing __syncthreads() of the original version was removed —
	// this kernel has no shared-memory communication, so a barrier at the very
	// end of the kernel had no effect.
}

And I call it like this:

//size of the current diagonal (starts at 1 in the top-left corner)
	int diag_size = 1;

	// Launch one kernel per anti-diagonal. Kernels issued to the same stream
	// execute in launch order, which gives the grid-wide synchronization
	// between diagonals that an in-kernel __syncthreads() cannot provide.
	for(int d=0; d<nbr_diag; d++) {

		dim3 dimBlock(BLOCK_SIZE, 1);

		// ceil-division: the original (diag_size / dimBlock.x) + 1 launched one
		// extra, fully idle block whenever diag_size was a multiple of BLOCK_SIZE
		dim3 dimGrid((diag_size + dimBlock.x - 1) / dimBlock.x, 1);

		AlignmentScoreKernel<<<dimGrid, dimBlock>>>(score_on_device, s_on_device, t_on_device, diag_size, d);

		// no per-iteration device sync needed (same-stream launches are ordered);
		// the deprecated cudaThreadSynchronize() call was removed from the loop

		// update the diagonal size:
		// grow while the maximum has not yet been reached
		if(diag_size < max_diag_size) {
			if( ((d < s.length) && (s.length <= t.length)) || ((d < t.length-1) && (s.length > t.length)) ) {
				diag_size++;
			}
		}
		// shrink once we enter the lower-right corner of the matrix
		if( ((s.length >= t.length) && (d >= s.length-1)) || ((s.length < t.length) && (d >= t.length-1)) ) {
			diag_size--;
		}
	}

	// wait for all diagonals to finish before reading the matrix back
	cudaDeviceSynchronize();

It seems to work :))

Thanks a lot ;)