```
__global__ void sortBlocks(int *a)
{
int i = 2;
__shared__ int temp[THREADS];
while (i <= THREADS)
{
if ((threadIdx.x %i) == 0)
{
int index1 = threadIdx.x + (blockIdx.x * blockDim.x);
int endIndex1 = index1 + i / 2;
int index2 = endIndex1;
int endIndex2 = index2 + i / 2;
int targetIndex = threadIdx.x;
int done = 0;
while (!done)
{
if ((index1 == endIndex1) && (index2 < endIndex2))
temp[targetIndex++] = a[index2++];
else if ((index2 == endIndex2) && (index1 < endIndex1))
temp[targetIndex++] = a[index1++];
else if (a[index1] < a[index2])
temp[targetIndex++] = a[index1++];
else
temp[targetIndex++] = a[index2++];
if ((index1 == endIndex1) && (index2 == endIndex2))
done = 1;
}
}
__syncthreads();
a[threadIdx.x + (blockIdx.x * blockDim.x)] = temp[threadIdx.x];
__syncthreads();
i *= 2;
}
}
```

I know that removing the % from the CUDA operation will speed up the operation.

I did not use % by replacing “if ((threadIdx.x %i) == 0)” with “if ((threadIdx.x &(i -1)) == 0)”, but the computation time did not get faster. What is the reason?

And can you figure out how to reduce computation time like this?