How to coalesced save results to global memory when threads have different running times

My program like this, function SubSortCommonState0’s calculate process is the same, there is a loop calculate process in function SubSortCommonState1, loop times is different by different input x,y,result_In, so the function have different running time in different threads.

__device__ uint8_t SubSortCommonState1(uint8_t x, uint8_t y, uint8_t result_In)
{
    uint8_t result;
// do some option, different input x,y,result_In, get different calculate time
    return result;
}
__device__ uint8_t SubSortCommonState0(uint8_t x, uint8_t y)
{
    uint8_t result;
// do some option,same calculate time
    return result;
}

__device__ uint32_t SortCommonState(uint8_t* Arr_X, uint8_t Arr_Y, uint32_t Arr_X_Len, uint32_t Arr_Y_Len)
{
    // do some element-wise work
    uint8_t rt_In;
    uint32_t rt_Out = 0;
    uint32_t i,j;
    for(i = 0; i < Arr_X_Len; i++)
    {
        for(j = 0; j < Arr_Y_Len; j++)
        {
            rt_In = SubSortCommonState0(Arr_X[i], Arr_Y[j]);
            rt_In = SubSortCommonState1(Arr_X[i], Arr_Y[j], rt_In);
            rt_Out  ^= ((i & (0x10000 - rt_In)) << 16) | (j & (0x10000 - rt_In)) 
        }
    }
    return rt_Out
}

__global__ void SubCalc(uint8_t* Arr_X, uint8_t* Arr_Y, uint32_t* Arr_X_Offset, uint32_t* Arr_X_Len, uint32_t* Arr_Y_Offset, uint32_t* Arr_Y_Len, uint32_t* OutId)
{
    uint32_t t_i = blockIdx.x*blockDim.x+threadIdx.x;
    OutId[t_i] = SortCommonState(&Arr_X[Arr_X_Offset[t_i]], &Arr_Y[Arr_Y_Offset[t_i]], Arr_X_Len[t_i], Arr_Y_Len[t_i]);//directly write to global memory cost much time
    //__shared__ uint32_t OutIndex[0x800];
    //OutIndex[t_i] = SortCommonState(&Arr_X[Arr_X_Offset[t_i]], &Arr_Y[Arr_Y_Offset[t_i]], Arr_X_Len[t_i], Arr_Y_Len[t_i]);//directly write to shared memory do not require additional time consumption
    __syncthreads();
}

uint8_t X[0x800000];
uint8_t Y[0x10000];
uint32_t X_Offset[0x800];
uint32_t Y_Offset[0x800];
uint32_t X_Len[0x800];
uinr32_t Y_Len[0x800];
uint32_t Out[0x800];

int main(){

    cudaError_t cerr = cudaSetDevice(0);
    if (cerr != cudaSuccess)
    {
        printf("cudaSetDevice 0 error! \n");
    }
    uint8_t* dev_X, dev_Y;
    uint32_t* dev_X_Offset, dev_Y_Offset, dev_X_Len, dev_Y_Len, dev_Out;
    cudaMalloc((uint8_t**)&dev_X, 0x800000*sizeof(uint8_t));
    cudaMalloc((uint8_t**)&dev_Y, 0x10000*sizeof(uint8_t));
    cudaMalloc((uint32_t**)&dev_X_Offset, 0x800*sizeof(uint32_t));
    cudaMalloc((uint32_t**)&dev_Y_Offset, 0x800*sizeof(uint32_t));
    cudaMalloc((uint32_t**)&dev_X_Len, 0x800*sizeof(uint32_t));
    cudaMalloc((uint32_t**)&dev_Y_Len, 0x800*sizeof(uint32_t));
    cudaMalloc((uint32_t**)&dev_Out, 0x800*sizeof(uint32_t));
    
    k<<<8, 0x100>>>(dev_X, dev_Y, dev_X_Offset, dev_Y_Offset, dev_X_Len, dev_Y_Len, dev_Out);
    cerr = cudaGetLastError()
    if (cerr != cudaSuccess)
    {
        printf("%s\n", cudaGetErrorString(cerr));
    }
    cudaDeviceSynchronize();

    cudaMemcpy(Out, dev_Out, 0x800 * sizeof(uint32_t), cudaMemcpyDeviceToHost);

    cudaFree(dev_X);
    cudaFree(dev_Y);
    cudaFree(dev_X_Offset);
    cudaFree(dev_Y_Offset);
    cudaFree(dev_X_Len);
    cudaFree(dev_Y_Len);
    cudaFree(dev_Out);
}

If i write the result from SortCommonState to the global memory it seems uncoalesced, so it cost a lot of time, but it run faster if i write to shared memory. Can anyone tell me the reason behind this, and how to change my code.

You have to compare to writing to shared memory and then copying the data to global memory. Just writing to shared memory alone has no visible effects and the compiler just removes all your calculations.

Use Compute Nsight to know, what really happens, e.g. how many memory accesses were done, how many computations and compare with the number needed in your program.

The write accesses of your program seem to be coalesced, I would rather expect the (multiple?) reading of the input to be suboptimal or the loops take too much computational time or the threads diverge.

Try a line like uint32_t out = SortCommonState(...); if (out > 0x83498ebd && out < 0x83498ec0) OutId[t_i] = out;

This nearly never happens (so no memory accesses), but the compiler would probably do all the calculations.

(This advice is for debugging only, in production code, it could have unintended consequences; if the compiler knows only outputs the interval would possibly be written, it could treat some of the calculations slightly different, if it can infer, how this would simplify the calculations.)

1 Like