My program looks like this. SubSortCommonState0 does the same amount of work for every input, but SubSortCommonState1 contains a loop whose iteration count depends on the inputs x, y and result_In, so its running time differs from thread to thread (see the illustrative sketch after the two stubs below).
__device__ uint8_t SubSortCommonState1(uint8_t x, uint8_t y, uint8_t result_In)
{
uint8_t result;
// some work here; how long it takes depends on x, y and result_In (data-dependent loop count)
return result;
}
__device__ uint8_t SubSortCommonState0(uint8_t x, uint8_t y)
{
uint8_t result;
// some work here; takes the same time for every input
return result;
}
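For illustration only (this is not my real algorithm, just an assumed shape with a hypothetical helper name): a loop like the following, whose trip count is derived from the inputs, would produce the varying run time described above.

__device__ uint8_t SubSortCommonState1_sketch(uint8_t x, uint8_t y, uint8_t result_In)
{
    uint8_t result = result_In;
    uint32_t n = (uint32_t)(x ^ y) + 1;            // iteration count depends on the inputs
    for (uint32_t k = 0; k < n; k++)
    {
        result = (uint8_t)(result * 31u + x + k);  // placeholder work
    }
    return result;                                 // threads with larger n take longer
}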
__device__ uint32_t SortCommonState(uint8_t* Arr_X, uint8_t* Arr_Y, uint32_t Arr_X_Len, uint32_t Arr_Y_Len)
{
// do some element-wise work
uint8_t rt_In;
uint32_t rt_Out = 0;
uint32_t i,j;
for(i = 0; i < Arr_X_Len; i++)
{
for(j = 0; j < Arr_Y_Len; j++)
{
rt_In = SubSortCommonState0(Arr_X[i], Arr_Y[j]);
rt_In = SubSortCommonState1(Arr_X[i], Arr_Y[j], rt_In);
rt_Out ^= ((i & (0x10000 - rt_In)) << 16) | (j & (0x10000 - rt_In));
}
}
return rt_Out;
}
__global__ void SubCalc(uint8_t* Arr_X, uint8_t* Arr_Y, uint32_t* Arr_X_Offset, uint32_t* Arr_X_Len, uint32_t* Arr_Y_Offset, uint32_t* Arr_Y_Len, uint32_t* OutId)
{
uint32_t t_i = blockIdx.x*blockDim.x+threadIdx.x;
OutId[t_i] = SortCommonState(&Arr_X[Arr_X_Offset[t_i]], &Arr_Y[Arr_Y_Offset[t_i]], Arr_X_Len[t_i], Arr_Y_Len[t_i]); // writing the result directly to global memory costs a lot of time
//__shared__ uint32_t OutIndex[0x800];
//OutIndex[t_i] = SortCommonState(&Arr_X[Arr_X_Offset[t_i]], &Arr_Y[Arr_Y_Offset[t_i]], Arr_X_Len[t_i], Arr_Y_Len[t_i]); // writing the result to shared memory instead does not add noticeable time
__syncthreads();
}
uint8_t X[0x800000];
uint8_t Y[0x10000];
uint32_t X_Offset[0x800];
uint32_t Y_Offset[0x800];
uint32_t X_Len[0x800];
uint32_t Y_Len[0x800];
uint32_t Out[0x800];
int main(){
cudaError_t cerr = cudaSetDevice(0);
if (cerr != cudaSuccess)
{
printf("cudaSetDevice 0 error! \n");
}
uint8_t *dev_X, *dev_Y;
uint32_t *dev_X_Offset, *dev_Y_Offset, *dev_X_Len, *dev_Y_Len, *dev_Out;
cudaMalloc((void**)&dev_X, 0x800000*sizeof(uint8_t));
cudaMalloc((void**)&dev_Y, 0x10000*sizeof(uint8_t));
cudaMalloc((void**)&dev_X_Offset, 0x800*sizeof(uint32_t));
cudaMalloc((void**)&dev_Y_Offset, 0x800*sizeof(uint32_t));
cudaMalloc((void**)&dev_X_Len, 0x800*sizeof(uint32_t));
cudaMalloc((void**)&dev_Y_Len, 0x800*sizeof(uint32_t));
cudaMalloc((void**)&dev_Out, 0x800*sizeof(uint32_t));
// note: filling the host arrays and copying X, Y, the offsets and the lengths to the device is not shown in this snippet
SubCalc<<<8, 0x100>>>(dev_X, dev_Y, dev_X_Offset, dev_X_Len, dev_Y_Offset, dev_Y_Len, dev_Out);
cerr = cudaGetLastError();
if (cerr != cudaSuccess)
{
printf("%s\n", cudaGetErrorString(cerr));
}
cudaDeviceSynchronize();
cudaMemcpy(Out, dev_Out, 0x800 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaFree(dev_X);
cudaFree(dev_Y);
cudaFree(dev_X_Offset);
cudaFree(dev_Y_Offset);
cudaFree(dev_X_Len);
cudaFree(dev_Y_Len);
cudaFree(dev_Out);
}
If I write the result of SortCommonState directly to global memory, the writes seem to be uncoalesced and the kernel takes much longer, but it runs much faster when I write the result to shared memory instead (the commented-out lines in SubCalc). Can anyone explain the reason behind this, and how I should change my code?
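To make the question concrete, this is the kind of change I have in mind (a rough sketch only; SubCalcStaged and OutIndex are my own names, the block size of 0x100 is assumed from the launch above, and I am not sure this is the right approach): each thread stages its result in shared memory, the block synchronizes, and only then do all threads write their results to consecutive global addresses together.

__global__ void SubCalcStaged(uint8_t* Arr_X, uint8_t* Arr_Y, uint32_t* Arr_X_Offset, uint32_t* Arr_X_Len, uint32_t* Arr_Y_Offset, uint32_t* Arr_Y_Len, uint32_t* OutId)
{
    __shared__ uint32_t OutIndex[0x100];           // one slot per thread, assuming blockDim.x == 0x100
    uint32_t t_i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread stores its (slow, divergent) result in shared memory first
    OutIndex[threadIdx.x] = SortCommonState(&Arr_X[Arr_X_Offset[t_i]], &Arr_Y[Arr_Y_Offset[t_i]], Arr_X_Len[t_i], Arr_Y_Len[t_i]);
    __syncthreads();
    // after the barrier every thread writes one element to a consecutive address,
    // so the block's global stores happen together
    OutId[t_i] = OutIndex[threadIdx.x];
}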