Updating and comparing the content of a matrix using CUDA

I have a matrix xAss[posID] which contains only “1” or “0”. I want to update the content of this matrix xAss[posID] based on the following illustration :

Per-3 blocks of the matrix xAss[posID], If I do sum operation for its each threadIdx.x, the result should not more than “1”. For example :

  • if I sum the xAss[0] + xAss[5] + xAss[10] , then the result will be equal to “2”. If this condition happens, I have to check whether xAss[0] or xAss[5] or xAss[10] have the value of “1”:
    • if xAss[0] = “1”, xAss[5] = “1”, and xAss[10] = “0”, then:
      • I have to compare the ChanGain[0] and ChanGain[5], if ChanGain[0] > ChanGain[5], then xAssUpdate[0] will be assigned as “1”, xAssUpdate[5] will be “0”, and xAssUpdate[10] will be “0”
    • if xAss[0] = “1”, xAss[5] = “0”, and xAss[10] = “1”, then:
      • I have to compare the ChanGain[0] and ChanGain[10], if ChanGain[0] > ChanGain[10], then xAssUpdate[0] will be assigned as “1”, xAssUpdate[5] will be “0”, and xAssUpdate[10] will be “0”

In the picture, because the ChanGain[0] > ChanGain[5], so the xAssUpdate[0] will be assigned as “1”, xAssUpdate[5] = “0”, and xAssUpdate[10] = “0”

Another example is:

  • if I sum the xAss[4] + xAss[9] + xAss[14] , then the result will be equal to “3”. If this condition happens, I have to compare the ChanGain[4], ChanGain[9], and ChanGain[10]:
    • if ChanGain[4] > ChanGain[9] and ChanGain[4] > ChanGain[14], then xAssUpdate[4] will be assigned as “1”, xAssUpdate[9] will be “0”, and xAssUpdate[14] will be “0”
    • if ChanGain[9] > ChanGain[4] and ChanGain[9] > ChanGain[14], then xAssUpdate[9] will be assigned as “1”, xAssUpdate[4] will be “0”, and xAssUpdate[14] will be “0”
    • if ChanGain[14] > ChanGain[4] and ChanGain[14] > ChanGain[9], then xAssUpdate[14] will be assigned as “1”, xAssUpdate[4] will be “0”, and xAssUpdate[9] will be “0”

In the picture, because the ChanGain[4] > ChanGain[9] and ChanGain[4] > ChanGain[14], so the xAssUpdate[4] will be assigned as “1”, xAssUpdate[9] = “0”, and xAssUpdate[14] = “0”

If, the sum of each threadId.x per-3 blocks doesn’t produce more than “1”, then xAssUpdate[posID] will be assigned as xAss[posID], for example:

  • if I sum the xAss[1] + xAss[6] + xAss[11], then the result will be equal to “1”, If this condition happens, then :
    • I will assign xAssUpdate[1] = xAss[1], xAssUpdate[6] = xAss[6], and xAssUpdate[11] = xAss[11].

I have made a kernel function as follows :

__global__ void Update_xAss(int *d_xAss2, int *d_xAss2New, double *d_ChanGainAll2)
{
	int posID = threadIdx.x + blockIdx.x * blockDim.x; // blockDim.x = 3
	int d, J;
	J = 3; //User number = 3
	//gridDim.x = Total blocks used in this simulation = 6

	for(d = 0; d < (gridDim.x/J); d++)
	{
		if((d_xAss2[threadIdx.x + J*d*blockDim.x] + d_xAss2[threadIdx.x + (J*d+1)*blockDim.x] + d_xAss2[threadIdx.x + (J*d+2)*blockDim.x]) > 1)
		{
			if((d_xAss2[threadIdx.x + J*d*blockDim.x] == 1) && (d_xAss2[threadIdx.x + (J*d+1)*blockDim.x] == 1) && (d_xAss2[threadIdx.x + (J*d+2)*blockDim.x] == 1))
			{
				if((d_ChanGainAll2[threadIdx.x + J*d*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+1)*blockDim.x]) && (d_ChanGainAll2[threadIdx.x + J*d*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+2)*blockDim.x]))
				{
					(d_xAss2New[threadIdx.x + (J*d+1)*blockDim.x]) = (d_xAss2New[threadIdx.x + (J*d+2)*blockDim.x]) = 0;
				}

				if((d_ChanGainAll2[threadIdx.x + (J*d+1)*blockDim.x] >= d_ChanGainAll2[threadIdx.x + J*d*blockDim.x]) && (d_ChanGainAll2[threadIdx.x + (J*d+1)*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+2)*blockDim.x]))
				{
					(d_xAss2New[threadIdx.x + J*d*blockDim.x]) = (d_xAss2New[threadIdx.x + (J*d+2)*blockDim.x]) = 0;
				}

				if((d_ChanGainAll2[threadIdx.x + (J*d+2)*blockDim.x] >= d_ChanGainAll2[threadIdx.x + J*d*blockDim.x]) && (d_ChanGainAll2[threadIdx.x + (J*d+2)*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+1)*blockDim.x]))
				{
					(d_xAss2New[threadIdx.x + J*d*blockDim.x]) = (d_xAss2New[threadIdx.x + (J*d+1)*blockDim.x]) = 0;
				}
			}

			if((d_xAss2[threadIdx.x + J*d*blockDim.x] == 1) && (d_xAss2[threadIdx.x + (J*d+1)*blockDim.x] == 1) && (d_xAss2[threadIdx.x + (J*d+2)*blockDim.x] == 0))
			{
				if((d_ChanGainAll2[threadIdx.x + J*d*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+1)*blockDim.x]))
				{
					(d_xAss2New[threadIdx.x + (J*d+1)*blockDim.x]) = 0;
				}

				else
				{
					(d_xAss2New[threadIdx.x + J*d*blockDim.x]) = 0;
				}
			}

			if((d_xAss2[threadIdx.x + J*d*blockDim.x] == 1) && (d_xAss2[threadIdx.x + (J*d+1)*blockDim.x] == 0) && (d_xAss2[threadIdx.x + (J*d+2)*blockDim.x] == 1))
			{
				if((d_ChanGainAll2[threadIdx.x + J*d*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+2)*blockDim.x]))
				{
					(d_xAss2New[threadIdx.x + (J*d+2)*blockDim.x]) = 0;
				}

				else
				{
					(d_xAss2New[threadIdx.x + J*d*blockDim.x]) = 0;
				}
			}

			if((d_xAss2[threadIdx.x + J*d*blockDim.x] == 0) && (d_xAss2[threadIdx.x + (J*d+1)*blockDim.x] == 1) && (d_xAss2[threadIdx.x + (J*d+2)*blockDim.x] == 1))
			{
				if((d_ChanGainAll2[threadIdx.x + (J*d+1)*blockDim.x] >= d_ChanGainAll2[threadIdx.x + (J*d+2)*blockDim.x]))
				{
					(d_xAss2New[threadIdx.x + (J*d+2)*blockDim.x]) = 0;
				}

				else
				{
					(d_xAss2New[threadIdx.x + (J*d+1)*blockDim.x]) = 0;
				}
			}
		}
		else
		{
			d_xAss2New[threadIdx.x + J*d*blockDim.x] = d_xAss2[threadIdx.x + J*d*blockDim.x];
			d_xAss2New[threadIdx.x + (J*d+1)*blockDim.x] = d_xAss2[threadIdx.x + (J*d+1)*blockDim.x];
			d_xAss2New[threadIdx.x + (J*d+2)*blockDim.x] = d_xAss2[threadIdx.x + (J*d+2)*blockDim.x];
		}
	}
}

My problem is there are too many conditional that I have made. Is there any other ways to make this conditional statement more efficiently?
Because in my other cases, I will have more large number of blocks (gridDim.x = 350) and threads (blockDim.x = 10), and if I use the same approach as I did in this example, then I will need to consider a lot of conditional statement (maybe several hundreds of conditional statement).

Kindly need anyone advises and suggestions.

Thank you so much.